001/**
002 * The contents of this file are subject to the license and copyright
003 * detailed in the LICENSE and NOTICE files at the root of the source
004 * tree.
005 *
006 */
007package org.fcrepo.migration.foxml;
008
009import org.apache.commons.io.FileUtils;
010import org.apache.lucene.analysis.Analyzer;
011import org.apache.lucene.analysis.standard.StandardAnalyzer;
012import org.apache.lucene.document.Document;
013import org.apache.lucene.document.Field;
014import org.apache.lucene.document.StringField;
015import org.apache.lucene.index.DirectoryReader;
016import org.apache.lucene.index.IndexReader;
017import org.apache.lucene.index.IndexWriter;
018import org.apache.lucene.index.IndexWriterConfig;
019import org.apache.lucene.index.Term;
020import org.apache.lucene.search.IndexSearcher;
021import org.apache.lucene.search.TermQuery;
022import org.apache.lucene.search.TopDocs;
023import org.apache.lucene.store.Directory;
024import org.apache.lucene.store.FSDirectory;
025import org.slf4j.Logger;
026
027import java.io.File;
028import java.io.IOException;
029
030import static org.slf4j.LoggerFactory.getLogger;
031
032/**
033 * An InternalIDResolver implementation that generates an index of
034 * datastream ids (filenames) to file paths for the contents of a
035 * datastream directory.  The directory is expected to contain just
036 * other directories and/or FOXML files.  The FOXML files are expected
037 * to have a filename that is reversibly mapped from a fedora internal
038 * id for that datastream version.
039 * @author mdurbin
040 */
041public abstract class DirectoryScanningIDResolver implements InternalIDResolver {
042
043    private static final Logger LOGGER = getLogger(InternalIDResolver.class);
044
045    /**
046     * A lucene IndexSearcher over an index maintained by this class.
047     * For every file found in the datastream directory a document exists
048     * in this index that contains an "id" field and a "path" field.  The
049     * id field is the internal id, the path field is the full path to the
050     * file containing that datastream content.
051     */
052    private final IndexSearcher searcher;
053    private final IndexReader reader;
054    private final FSDirectory fsDirectory;
055
056    /**
057     * directory scanning ID resolver
058     * @param cachedIndexDir the index directory.  If it exists, the old cache will be used, if it doesn't a new
059     *                 cache will be built at that location.  If it is null, a new cache will be built in
060     *                 the temp file space that will be deleted upon application shutdown.
061     * @param dsRoot the datastream root
062     * @throws IOException IO exception creating temp and index files/directories
063     */
064    public DirectoryScanningIDResolver(final File cachedIndexDir, final File dsRoot) throws IOException {
065        final File indexDir;
066        if (cachedIndexDir == null) {
067            final File temp = File.createTempFile("tempfile", "basedir");
068            temp.delete();
069            temp.mkdir();
070            indexDir = new File(temp, "index");
071            LOGGER.info("No index directory specified.  Creating temporary index at \""
072                    + indexDir.getAbsolutePath() + "\".");
073            Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
074                @Override
075                public void run() {
076                    try {
077                        if (searcher != null) {
078                            searcher.getIndexReader().close();
079                        }
080                        LOGGER.info("Deleting index directory at \"" + indexDir.getAbsolutePath() + "\"...");
081                        FileUtils.deleteDirectory(indexDir);
082                    } catch (IOException e) {
083                        LOGGER.error("Unable to delete index directory at \"" + indexDir.getAbsolutePath() + "\"!", e);
084                        e.printStackTrace();
085                    }
086                }
087            }));
088        } else {
089            indexDir = cachedIndexDir;
090        }
091
092        // Index dir exists and is non-empty
093        if (indexDir.exists() && indexDir.list().length > 0) {
094            LOGGER.warn("Index exists at \"" + indexDir.getPath() + "\" and will be used.  "
095                    + "To clear index, simply delete this directory and re-run the application.");
096        } else {
097            final Analyzer analyzer = new StandardAnalyzer();
098            final IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
099            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
100
101            try (final Directory dir = FSDirectory.open(indexDir.toPath());
102                 final IndexWriter writer = new IndexWriter(dir, iwc)) {
103
104                LOGGER.info("Building an index of all the datastreams in \"" + dsRoot.getPath() + "\"...");
105                indexDatastreams(writer, dsRoot);
106
107                writer.commit();
108
109            }
110        }
111
112        fsDirectory = FSDirectory.open(indexDir.toPath());
113        reader = DirectoryReader.open(fsDirectory);
114        searcher = new IndexSearcher(reader);
115    }
116
117    public void close() throws IOException {
118        reader.close();
119        fsDirectory.close();
120    }
121
122    @Override
123    public CachedContent resolveInternalID(final String id) {
124        try {
125            final TopDocs result = searcher.search(new TermQuery(new Term("id", id)), 2);
126            if (result.totalHits == 1) {
127                return new FileCachedContent(new File(searcher.doc(result.scoreDocs[0].doc).get("path")));
128            } else if (result.totalHits < 1) {
129                throw new RuntimeException("Unable to resolve internal ID \"" + id + "\"!");
130            } else {
131                throw new IllegalStateException(result.totalHits + " files matched the internal id \"" + id + "\".  ("
132                        + searcher.doc(result.scoreDocs[0].doc).get("path") + ", "
133                        + searcher.doc(result.scoreDocs[1].doc).get("path") + "...)");
134            }
135        } catch (final IOException e) {
136            throw new RuntimeException(e);
137        }
138    }
139
140    private void indexDatastreams(final IndexWriter writer, final File f) throws IOException {
141        if (f.isDirectory()) {
142            for (final File child : f.listFiles()) {
143                indexDatastreams(writer, child);
144            }
145        } else {
146            final Document doc = new Document();
147            doc.add(new StringField("path", f.getPath(), Field.Store.YES));
148            doc.add(new StringField("id", getInternalIdForFile(f), Field.Store.YES));
149            LOGGER.trace("Added \"{}\" for: {}", getInternalIdForFile(f), f.getPath());
150            writer.addDocument(doc);
151        }
152    }
153
154    /**
155     * Determines the internal id for the given file.
156     *
157     * @param f file to check for
158     * @return string containing internal id for the file
159     */
160    protected abstract String getInternalIdForFile(File f);
161
162}