001/*
002 * Copyright 2015 DuraSpace, Inc.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package org.fcrepo.migration.foxml;
017
018import org.apache.commons.io.FileUtils;
019import org.apache.lucene.analysis.Analyzer;
020import org.apache.lucene.analysis.standard.StandardAnalyzer;
021import org.apache.lucene.document.Document;
022import org.apache.lucene.document.Field;
023import org.apache.lucene.document.StringField;
024import org.apache.lucene.index.DirectoryReader;
025import org.apache.lucene.index.IndexReader;
026import org.apache.lucene.index.IndexWriter;
027import org.apache.lucene.index.IndexWriterConfig;
028import org.apache.lucene.index.Term;
029import org.apache.lucene.search.IndexSearcher;
030import org.apache.lucene.search.TermQuery;
031import org.apache.lucene.search.TopDocs;
032import org.apache.lucene.store.Directory;
033import org.apache.lucene.store.FSDirectory;
034import org.slf4j.Logger;
035
036import java.io.File;
037import java.io.IOException;
038
039import static org.slf4j.LoggerFactory.getLogger;
040
041/**
042 * An InternalIDResolver implementation that generates an index of
043 * datastream ids (filenames) to file paths for the contents of a
044 * datastream directory.  The directory is expected to contain just
045 * other directories and/or FOXML files.  The FOXML files are expected
046 * to have a filename that is reversibly mapped from a fedora internal
047 * id for that datastream version.
048 * @author mdurbin
049 */
050public abstract class DirectoryScanningIDResolver implements InternalIDResolver {
051
052    private static final Logger LOGGER = getLogger(InternalIDResolver.class);
053
054    /**
055     * A lucene IndexSearcher over an index maintained by this class.
056     * For every file found in the datastream directory a document exists
057     * in this index that contains an "id" field and a "path" field.  The
058     * id field is the internal id, the path field is the full path to the
059     * file containing that datastream content.
060     */
061    private final IndexSearcher searcher;
062
063    /**
064     * directory scanning ID resolver
065     * @param cachedIndexDir the index directory.  If it exists, the old cache will be used, if it doesn't a new
066     *                 cache will be built at that location.  If it is null, a new cache will be built in
067     *                 the temp file space that will be deleted upon application shutdown.
068     * @param dsRoot the datastream root
069     * @throws IOException IO exception creating temp and index files/directories
070     */
071    public DirectoryScanningIDResolver(final File cachedIndexDir, final File dsRoot) throws IOException {
072        final File indexDir;
073        if (cachedIndexDir == null) {
074            final File temp = File.createTempFile("tempfile", "basedir");
075            temp.delete();
076            temp.mkdir();
077            indexDir = new File(temp, "index");
078            LOGGER.info("No index directory specified.  Creating temporary index at \""
079                    + indexDir.getAbsolutePath() + "\".");
080            Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
081                @Override
082                public void run() {
083                    try {
084                        if (searcher != null) {
085                            searcher.getIndexReader().close();
086                        }
087                        LOGGER.info("Deleting index directory at \"" + indexDir.getAbsolutePath() + "\"...");
088                        FileUtils.deleteDirectory(indexDir);
089                    } catch (IOException e) {
090                        LOGGER.error("Unable to delete index directory at \"" + indexDir.getAbsolutePath() + "\"!", e);
091                        e.printStackTrace();
092                    }
093                }
094            }));
095        } else {
096            indexDir = cachedIndexDir;
097        }
098
099        // Index dir exists and is non-empty
100        if (indexDir.exists() && indexDir.list().length > 0) {
101            LOGGER.warn("Index exists at \"" + indexDir.getPath() + "\" and will be used.  "
102                    + "To clear index, simply delete this directory and re-run the application.");
103        } else {
104            final Analyzer analyzer = new StandardAnalyzer();
105            final IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
106            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
107
108            try (final Directory dir = FSDirectory.open(indexDir.toPath());
109                 final IndexWriter writer = new IndexWriter(dir, iwc)) {
110
111                LOGGER.info("Building an index of all the datastreams in \"" + dsRoot.getPath() + "\"...");
112                indexDatastreams(writer, dsRoot);
113
114                writer.commit();
115
116            }
117        }
118
119        final IndexReader reader = DirectoryReader.open(FSDirectory.open(indexDir.toPath()));
120        searcher = new IndexSearcher(reader);
121    }
122
123    @Override
124    public CachedContent resolveInternalID(final String id) {
125        try {
126            final TopDocs result = searcher.search(new TermQuery(new Term("id", id)), 2);
127            if (result.totalHits == 1) {
128                return new FileCachedContent(new File(searcher.doc(result.scoreDocs[0].doc).get("path")));
129            } else if (result.totalHits < 1) {
130                throw new RuntimeException("Unable to resolve internal ID \"" + id + "\"!");
131            } else {
132                throw new IllegalStateException(result.totalHits + " files matched the internal id \"" + id + "\".  ("
133                        + searcher.doc(result.scoreDocs[0].doc).get("path") + ", "
134                        + searcher.doc(result.scoreDocs[1].doc).get("path") + "...)");
135            }
136        } catch (final IOException e) {
137            throw new RuntimeException(e);
138        }
139    }
140
141    private void indexDatastreams(final IndexWriter writer, final File f) throws IOException {
142        if (f.isDirectory()) {
143            for (final File child : f.listFiles()) {
144                indexDatastreams(writer, child);
145            }
146        } else {
147            final Document doc = new Document();
148            doc.add(new StringField("path", f.getPath(), Field.Store.YES));
149            doc.add(new StringField("id", getInternalIdForFile(f), Field.Store.YES));
150            LOGGER.trace("Added \"{}\" for: {}", getInternalIdForFile(f), f.getPath());
151            writer.addDocument(doc);
152        }
153    }
154
155    /**
156     * Determines the internal id for the given file.
157     *
158     * @param f file to check for
159     * @return string containing internal id for the file
160     */
161    protected abstract String getInternalIdForFile(File f);
162
163}