001/* 002 * Copyright 2015 DuraSpace, Inc. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.fcrepo.migration.foxml; 017 018import org.apache.commons.io.FileUtils; 019import org.apache.lucene.analysis.Analyzer; 020import org.apache.lucene.analysis.standard.StandardAnalyzer; 021import org.apache.lucene.document.Document; 022import org.apache.lucene.document.Field; 023import org.apache.lucene.document.StringField; 024import org.apache.lucene.index.DirectoryReader; 025import org.apache.lucene.index.IndexReader; 026import org.apache.lucene.index.IndexWriter; 027import org.apache.lucene.index.IndexWriterConfig; 028import org.apache.lucene.index.Term; 029import org.apache.lucene.search.IndexSearcher; 030import org.apache.lucene.search.TermQuery; 031import org.apache.lucene.search.TopDocs; 032import org.apache.lucene.store.Directory; 033import org.apache.lucene.store.FSDirectory; 034import org.slf4j.Logger; 035 036import java.io.File; 037import java.io.IOException; 038 039import static org.slf4j.LoggerFactory.getLogger; 040 041/** 042 * An InternalIDResolver implementation that generates an index of 043 * datastream ids (filenames) to file paths for the contents of a 044 * datastream directory. The directory is expected to contain just 045 * other directories and/or FOXML files. The FOXML files are expected 046 * to have a filename that is reversibly mapped from a fedora internal 047 * id for that datastream version. 048 * @author mdurbin 049 */ 050public abstract class DirectoryScanningIDResolver implements InternalIDResolver { 051 052 private static final Logger LOGGER = getLogger(InternalIDResolver.class); 053 054 /** 055 * A lucene IndexSearcher over an index maintained by this class. 056 * For every file found in the datastream directory a document exists 057 * in this index that contains an "id" field and a "path" field. The 058 * id field is the internal id, the path field is the full path to the 059 * file containing that datastream content. 060 */ 061 private final IndexSearcher searcher; 062 063 /** 064 * directory scanning ID resolver 065 * @param cachedIndexDir the index directory. If it exists, the old cache will be used, if it doesn't a new 066 * cache will be built at that location. If it is null, a new cache will be built in 067 * the temp file space that will be deleted upon application shutdown. 068 * @param dsRoot the datastream root 069 * @throws IOException IO exception creating temp and index files/directories 070 */ 071 public DirectoryScanningIDResolver(final File cachedIndexDir, final File dsRoot) throws IOException { 072 final File indexDir; 073 if (cachedIndexDir == null) { 074 final File temp = File.createTempFile("tempfile", "basedir"); 075 temp.delete(); 076 temp.mkdir(); 077 indexDir = new File(temp, "index"); 078 LOGGER.info("No index directory specified. Creating temporary index at \"" 079 + indexDir.getAbsolutePath() + "\"."); 080 Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() { 081 @Override 082 public void run() { 083 try { 084 if (searcher != null) { 085 searcher.getIndexReader().close(); 086 } 087 LOGGER.info("Deleting index directory at \"" + indexDir.getAbsolutePath() + "\"..."); 088 FileUtils.deleteDirectory(indexDir); 089 } catch (IOException e) { 090 LOGGER.error("Unable to delete index directory at \"" + indexDir.getAbsolutePath() + "\"!", e); 091 e.printStackTrace(); 092 } 093 } 094 })); 095 } else { 096 indexDir = cachedIndexDir; 097 } 098 099 // Index dir exists and is non-empty 100 if (indexDir.exists() && indexDir.list().length > 0) { 101 LOGGER.warn("Index exists at \"" + indexDir.getPath() + "\" and will be used. " 102 + "To clear index, simply delete this directory and re-run the application."); 103 } else { 104 final Analyzer analyzer = new StandardAnalyzer(); 105 final IndexWriterConfig iwc = new IndexWriterConfig(analyzer); 106 iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); 107 108 try (final Directory dir = FSDirectory.open(indexDir.toPath()); 109 final IndexWriter writer = new IndexWriter(dir, iwc)) { 110 111 LOGGER.info("Building an index of all the datastreams in \"" + dsRoot.getPath() + "\"..."); 112 indexDatastreams(writer, dsRoot); 113 114 writer.commit(); 115 116 } 117 } 118 119 final IndexReader reader = DirectoryReader.open(FSDirectory.open(indexDir.toPath())); 120 searcher = new IndexSearcher(reader); 121 } 122 123 @Override 124 public CachedContent resolveInternalID(final String id) { 125 try { 126 final TopDocs result = searcher.search(new TermQuery(new Term("id", id)), 2); 127 if (result.totalHits == 1) { 128 return new FileCachedContent(new File(searcher.doc(result.scoreDocs[0].doc).get("path"))); 129 } else if (result.totalHits < 1) { 130 throw new RuntimeException("Unable to resolve internal ID \"" + id + "\"!"); 131 } else { 132 throw new IllegalStateException(result.totalHits + " files matched the internal id \"" + id + "\". (" 133 + searcher.doc(result.scoreDocs[0].doc).get("path") + ", " 134 + searcher.doc(result.scoreDocs[1].doc).get("path") + "...)"); 135 } 136 } catch (final IOException e) { 137 throw new RuntimeException(e); 138 } 139 } 140 141 private void indexDatastreams(final IndexWriter writer, final File f) throws IOException { 142 if (f.isDirectory()) { 143 for (final File child : f.listFiles()) { 144 indexDatastreams(writer, child); 145 } 146 } else { 147 final Document doc = new Document(); 148 doc.add(new StringField("path", f.getPath(), Field.Store.YES)); 149 doc.add(new StringField("id", getInternalIdForFile(f), Field.Store.YES)); 150 LOGGER.trace("Added \"{}\" for: {}", getInternalIdForFile(f), f.getPath()); 151 writer.addDocument(doc); 152 } 153 } 154 155 /** 156 * Determines the internal id for the given file. 157 * 158 * @param f file to check for 159 * @return string containing internal id for the file 160 */ 161 protected abstract String getInternalIdForFile(File f); 162 163}