001/* 002 * Copyright 2015 DuraSpace, Inc. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.fcrepo.migration.foxml; 017 018import org.apache.commons.io.FileUtils; 019import org.apache.lucene.analysis.Analyzer; 020import org.apache.lucene.analysis.standard.StandardAnalyzer; 021import org.apache.lucene.document.Document; 022import org.apache.lucene.document.Field; 023import org.apache.lucene.document.StringField; 024import org.apache.lucene.index.DirectoryReader; 025import org.apache.lucene.index.IndexReader; 026import org.apache.lucene.index.IndexWriter; 027import org.apache.lucene.index.IndexWriterConfig; 028import org.apache.lucene.index.Term; 029import org.apache.lucene.search.IndexSearcher; 030import org.apache.lucene.search.TermQuery; 031import org.apache.lucene.search.TopDocs; 032import org.apache.lucene.store.Directory; 033import org.apache.lucene.store.FSDirectory; 034import org.slf4j.Logger; 035 036import java.io.File; 037import java.io.IOException; 038 039import static org.slf4j.LoggerFactory.getLogger; 040 041/** 042 * An InternalIDResolver implementation that generates an index of 043 * datastream ids (filenames) to file paths for the contents of a 044 * datastream directory. The directory is expected to contain just 045 * other directories and/or FOXML files. The FOXML files are expected 046 * to have a filename that is reversibly mapped from a fedora internal 047 * id for that datastream version. 048 * @author mdurbin 049 */ 050public abstract class DirectoryScanningIDResolver implements InternalIDResolver { 051 052 private static final Logger LOGGER = getLogger(InternalIDResolver.class); 053 054 /** 055 * A lucene IndexSearcher over an index maintained by this class. 056 * For every file found in the datastream directory a document exists 057 * in this index that contains an "id" field and a "path" field. The 058 * id field is the internal id, the path field is the full path to the 059 * file containing that datastream content. 060 */ 061 private final IndexSearcher searcher; 062 private final IndexReader reader; 063 private final FSDirectory fsDirectory; 064 065 /** 066 * directory scanning ID resolver 067 * @param cachedIndexDir the index directory. If it exists, the old cache will be used, if it doesn't a new 068 * cache will be built at that location. If it is null, a new cache will be built in 069 * the temp file space that will be deleted upon application shutdown. 070 * @param dsRoot the datastream root 071 * @throws IOException IO exception creating temp and index files/directories 072 */ 073 public DirectoryScanningIDResolver(final File cachedIndexDir, final File dsRoot) throws IOException { 074 final File indexDir; 075 if (cachedIndexDir == null) { 076 final File temp = File.createTempFile("tempfile", "basedir"); 077 temp.delete(); 078 temp.mkdir(); 079 indexDir = new File(temp, "index"); 080 LOGGER.info("No index directory specified. Creating temporary index at \"" 081 + indexDir.getAbsolutePath() + "\"."); 082 Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() { 083 @Override 084 public void run() { 085 try { 086 if (searcher != null) { 087 searcher.getIndexReader().close(); 088 } 089 LOGGER.info("Deleting index directory at \"" + indexDir.getAbsolutePath() + "\"..."); 090 FileUtils.deleteDirectory(indexDir); 091 } catch (IOException e) { 092 LOGGER.error("Unable to delete index directory at \"" + indexDir.getAbsolutePath() + "\"!", e); 093 e.printStackTrace(); 094 } 095 } 096 })); 097 } else { 098 indexDir = cachedIndexDir; 099 } 100 101 // Index dir exists and is non-empty 102 if (indexDir.exists() && indexDir.list().length > 0) { 103 LOGGER.warn("Index exists at \"" + indexDir.getPath() + "\" and will be used. " 104 + "To clear index, simply delete this directory and re-run the application."); 105 } else { 106 final Analyzer analyzer = new StandardAnalyzer(); 107 final IndexWriterConfig iwc = new IndexWriterConfig(analyzer); 108 iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); 109 110 try (final Directory dir = FSDirectory.open(indexDir.toPath()); 111 final IndexWriter writer = new IndexWriter(dir, iwc)) { 112 113 LOGGER.info("Building an index of all the datastreams in \"" + dsRoot.getPath() + "\"..."); 114 indexDatastreams(writer, dsRoot); 115 116 writer.commit(); 117 118 } 119 } 120 121 fsDirectory = FSDirectory.open(indexDir.toPath()); 122 reader = DirectoryReader.open(fsDirectory); 123 searcher = new IndexSearcher(reader); 124 } 125 126 public void close() throws IOException { 127 reader.close(); 128 fsDirectory.close(); 129 } 130 131 @Override 132 public CachedContent resolveInternalID(final String id) { 133 try { 134 final TopDocs result = searcher.search(new TermQuery(new Term("id", id)), 2); 135 if (result.totalHits == 1) { 136 return new FileCachedContent(new File(searcher.doc(result.scoreDocs[0].doc).get("path"))); 137 } else if (result.totalHits < 1) { 138 throw new RuntimeException("Unable to resolve internal ID \"" + id + "\"!"); 139 } else { 140 throw new IllegalStateException(result.totalHits + " files matched the internal id \"" + id + "\". (" 141 + searcher.doc(result.scoreDocs[0].doc).get("path") + ", " 142 + searcher.doc(result.scoreDocs[1].doc).get("path") + "...)"); 143 } 144 } catch (final IOException e) { 145 throw new RuntimeException(e); 146 } 147 } 148 149 private void indexDatastreams(final IndexWriter writer, final File f) throws IOException { 150 if (f.isDirectory()) { 151 for (final File child : f.listFiles()) { 152 indexDatastreams(writer, child); 153 } 154 } else { 155 final Document doc = new Document(); 156 doc.add(new StringField("path", f.getPath(), Field.Store.YES)); 157 doc.add(new StringField("id", getInternalIdForFile(f), Field.Store.YES)); 158 LOGGER.trace("Added \"{}\" for: {}", getInternalIdForFile(f), f.getPath()); 159 writer.addDocument(doc); 160 } 161 } 162 163 /** 164 * Determines the internal id for the given file. 165 * 166 * @param f file to check for 167 * @return string containing internal id for the file 168 */ 169 protected abstract String getInternalIdForFile(File f); 170 171}