001/*
002 * Copyright 2015 DuraSpace, Inc.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.fcrepo.migration.foxml;
017
018import org.apache.commons.io.FileUtils;
019import org.apache.lucene.analysis.Analyzer;
020import org.apache.lucene.analysis.standard.StandardAnalyzer;
021import org.apache.lucene.document.Document;
022import org.apache.lucene.document.Field;
023import org.apache.lucene.document.StringField;
024import org.apache.lucene.index.DirectoryReader;
025import org.apache.lucene.index.IndexReader;
026import org.apache.lucene.index.IndexWriter;
027import org.apache.lucene.index.IndexWriterConfig;
028import org.apache.lucene.index.Term;
029import org.apache.lucene.search.IndexSearcher;
030import org.apache.lucene.search.TermQuery;
031import org.apache.lucene.search.TopDocs;
032import org.apache.lucene.store.Directory;
033import org.apache.lucene.store.FSDirectory;
034import org.slf4j.Logger;
035
036import java.io.File;
037import java.io.IOException;
038
039import static org.slf4j.LoggerFactory.getLogger;
040
041/**
042 * An InternalIDResolver implementation that generates an index of
043 * datastream ids (filenames) to file paths for the contents of a
044 * datastream directory.  The directory is expected to contain just
045 * other directories and/or FOXML files.  The FOXML files are expected
046 * to have a filename that is reversibly mapped from a fedora internal
047 * id for that datastream version.
048 * @author mdurbin
049 */
050public abstract class DirectoryScanningIDResolver implements InternalIDResolver {
051
052    private static final Logger LOGGER = getLogger(InternalIDResolver.class);
053
054    /**
055     * A lucene IndexSearcher over an index maintained by this class.
056     * For every file found in the datastream directory a document exists
057     * in this index that contains an "id" field and a "path" field.  The
058     * id field is the internal id, the path field is the full path to the
059     * file containing that datastream content.
060     */
061    private final IndexSearcher searcher;
062    private final IndexReader reader;
063    private final FSDirectory fsDirectory;
064
065    /**
066     * directory scanning ID resolver
067     * @param cachedIndexDir the index directory.  If it exists, the old cache will be used, if it doesn't a new
068     *                 cache will be built at that location.  If it is null, a new cache will be built in
069     *                 the temp file space that will be deleted upon application shutdown.
070     * @param dsRoot the datastream root
071     * @throws IOException IO exception creating temp and index files/directories
072     */
073    public DirectoryScanningIDResolver(final File cachedIndexDir, final File dsRoot) throws IOException {
074        final File indexDir;
075        if (cachedIndexDir == null) {
076            final File temp = File.createTempFile("tempfile", "basedir");
077            temp.delete();
078            temp.mkdir();
079            indexDir = new File(temp, "index");
080            LOGGER.info("No index directory specified.  Creating temporary index at \""
081                    + indexDir.getAbsolutePath() + "\".");
082            Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
083                @Override
084                public void run() {
085                    try {
086                        if (searcher != null) {
087                            searcher.getIndexReader().close();
088                        }
089                        LOGGER.info("Deleting index directory at \"" + indexDir.getAbsolutePath() + "\"...");
090                        FileUtils.deleteDirectory(indexDir);
091                    } catch (IOException e) {
092                        LOGGER.error("Unable to delete index directory at \"" + indexDir.getAbsolutePath() + "\"!", e);
093                        e.printStackTrace();
094                    }
095                }
096            }));
097        } else {
098            indexDir = cachedIndexDir;
099        }
100
101        // Index dir exists and is non-empty
102        if (indexDir.exists() && indexDir.list().length > 0) {
103            LOGGER.warn("Index exists at \"" + indexDir.getPath() + "\" and will be used.  "
104                    + "To clear index, simply delete this directory and re-run the application.");
105        } else {
106            final Analyzer analyzer = new StandardAnalyzer();
107            final IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
108            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
109
110            try (final Directory dir = FSDirectory.open(indexDir.toPath());
111                 final IndexWriter writer = new IndexWriter(dir, iwc)) {
112
113                LOGGER.info("Building an index of all the datastreams in \"" + dsRoot.getPath() + "\"...");
114                indexDatastreams(writer, dsRoot);
115
116                writer.commit();
117
118            }
119        }
120
121        fsDirectory = FSDirectory.open(indexDir.toPath());
122        reader = DirectoryReader.open(fsDirectory);
123        searcher = new IndexSearcher(reader);
124    }
125
126    public void close() throws IOException {
127        reader.close();
128        fsDirectory.close();
129    }
130
131    @Override
132    public CachedContent resolveInternalID(final String id) {
133        try {
134            final TopDocs result = searcher.search(new TermQuery(new Term("id", id)), 2);
135            if (result.totalHits == 1) {
136                return new FileCachedContent(new File(searcher.doc(result.scoreDocs[0].doc).get("path")));
137            } else if (result.totalHits < 1) {
138                throw new RuntimeException("Unable to resolve internal ID \"" + id + "\"!");
139            } else {
140                throw new IllegalStateException(result.totalHits + " files matched the internal id \"" + id + "\".  ("
141                        + searcher.doc(result.scoreDocs[0].doc).get("path") + ", "
142                        + searcher.doc(result.scoreDocs[1].doc).get("path") + "...)");
143            }
144        } catch (final IOException e) {
145            throw new RuntimeException(e);
146        }
147    }
148
149    private void indexDatastreams(final IndexWriter writer, final File f) throws IOException {
150        if (f.isDirectory()) {
151            for (final File child : f.listFiles()) {
152                indexDatastreams(writer, child);
153            }
154        } else {
155            final Document doc = new Document();
156            doc.add(new StringField("path", f.getPath(), Field.Store.YES));
157            doc.add(new StringField("id", getInternalIdForFile(f), Field.Store.YES));
158            LOGGER.trace("Added \"{}\" for: {}", getInternalIdForFile(f), f.getPath());
159            writer.addDocument(doc);
160        }
161    }
162
163    /**
164     * Determines the internal id for the given file.
165     *
166     * @param f file to check for
167     * @return string containing internal id for the file
168     */
169    protected abstract String getInternalIdForFile(File f);
170
171}