001/** 002 * Copyright (c) 2007-2008, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.util.cr; 025 026import java.io.File; 027import java.io.IOException; 028import java.util.ArrayList; 029import java.util.Arrays; 030import java.util.Iterator; 031import java.util.List; 032import java.util.regex.Pattern; 033 034import org.apache.commons.io.filefilter.AndFileFilter; 035import org.apache.commons.io.filefilter.IOFileFilter; 036import org.apache.commons.io.filefilter.NameFileFilter; 037import org.apache.commons.io.filefilter.OrFileFilter; 038import org.apache.commons.io.filefilter.SuffixFileFilter; 039import org.apache.commons.io.filefilter.TrueFileFilter; 040import org.apache.uima.UimaContext; 041import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 042import org.apache.uima.cas.CAS; 043import org.apache.uima.collection.CollectionException; 044import org.apache.uima.collection.CollectionReader; 045import org.apache.uima.collection.CollectionReaderDescription; 046import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; 047import org.apache.uima.fit.component.ViewCreatorAnnotator; 048import org.apache.uima.fit.descriptor.ConfigurationParameter; 049import org.apache.uima.fit.descriptor.SofaCapability; 050import org.apache.uima.fit.factory.CollectionReaderFactory; 051import org.apache.uima.jcas.JCas; 052import org.apache.uima.pear.util.FileUtil; 053import org.apache.uima.resource.ResourceInitializationException; 054import org.apache.uima.util.FileUtils; 055import org.apache.uima.util.Progress; 056import org.apache.uima.util.ProgressImpl; 057import org.cleartk.util.ViewUriUtil; 058 059/** 060 * <br> 061 * Copyright (c) 2007-2008, Regents of the University of Colorado <br> 062 * All rights reserved. 063 * <p> 064 * 065 * A CollectionReader that loads all files in a directory tree. 066 * 067 * Files are loaded as plain text and stored in the JCas view selected by the user. ClearTK Document 068 * objects are added to the same JCas view to record the file IDs and paths. 069 * 070 * @author Steven Bethard 071 * @author Philip Ogren 072 */ 073@SofaCapability(outputSofas = ViewUriUtil.URI) 074public class FilesCollectionReader extends JCasCollectionReader_ImplBase { 075 076 public static CollectionReaderDescription getDescription(String fileOrDir) 077 throws ResourceInitializationException { 078 return CollectionReaderFactory.createReaderDescription( 079 FilesCollectionReader.class, 080 null, 081 PARAM_ROOT_FILE, 082 fileOrDir); 083 } 084 085 public static CollectionReader getCollectionReader(String fileOrDir) 086 throws ResourceInitializationException { 087 return CollectionReaderFactory.createReader(getDescription(fileOrDir)); 088 } 089 090 public static CollectionReaderDescription getDescriptionWithView(String dir, String viewName) 091 throws ResourceInitializationException { 092 return CollectionReaderFactory.createReaderDescription( 093 FilesCollectionReader.class, 094 PARAM_ROOT_FILE, 095 dir, 096 PARAM_VIEW_NAME, 097 viewName); 098 } 099 100 public static CollectionReader getCollectionReaderWithView(String dir, String viewName) 101 throws ResourceInitializationException { 102 return CollectionReaderFactory.createReader(getDescriptionWithView(dir, viewName)); 103 } 104 105 public static CollectionReaderDescription getDescriptionWithPatterns( 106 String dir, 107 String viewName, 108 String... patterns) throws ResourceInitializationException { 109 return CollectionReaderFactory.createReaderDescription( 110 FilesCollectionReader.class, 111 PARAM_ROOT_FILE, 112 dir, 113 PARAM_VIEW_NAME, 114 viewName, 115 PARAM_PATTERNS, 116 patterns); 117 } 118 119 public static CollectionReader getCollectionReaderWithPatterns( 120 String dir, 121 String viewName, 122 String... patterns) throws ResourceInitializationException { 123 return CollectionReaderFactory.createReader(getDescriptionWithPatterns(dir, viewName, patterns)); 124 } 125 126 public static CollectionReaderDescription getDescriptionWithSuffixes( 127 String dir, 128 String viewName, 129 String... suffixes) throws ResourceInitializationException { 130 return CollectionReaderFactory.createReaderDescription( 131 FilesCollectionReader.class, 132 PARAM_ROOT_FILE, 133 dir, 134 PARAM_VIEW_NAME, 135 viewName, 136 PARAM_SUFFIXES, 137 suffixes); 138 } 139 140 public static CollectionReader getCollectionReaderWithSuffixes( 141 String dir, 142 String viewName, 143 String... suffixes) throws ResourceInitializationException { 144 return CollectionReaderFactory.createReader(getDescriptionWithSuffixes(dir, viewName, suffixes)); 145 } 146 147 public static final String PARAM_ROOT_FILE = "rootFile"; 148 149 @ConfigurationParameter( 150 name = PARAM_ROOT_FILE, 151 mandatory = true, 152 description = "takes either the name of a single file or the root directory containing all the files to be processed.") 153 protected File rootFile; 154 155 public static final String PARAM_VIEW_NAME = "viewName"; 156 157 @ConfigurationParameter( 158 name = PARAM_VIEW_NAME, 159 mandatory = false, 160 description = "takes the the name that should be given to the JCas view that the document texts should be set to.", 161 defaultValue = CAS.NAME_DEFAULT_SOFA) 162 private String viewName; 163 164 public static final String PARAM_LANGUAGE = "language"; 165 166 @ConfigurationParameter( 167 name = PARAM_LANGUAGE, 168 mandatory = false, 169 description = "takes the language code corresponding to the language of the documents being examined. The value of this parameter " 170 + "is simply passed on to JCas.setDocumentLanguage(String).") 171 private String language; 172 173 public static final String PARAM_ENCODING = "encoding"; 174 175 @ConfigurationParameter( 176 name = PARAM_ENCODING, 177 mandatory = false, 178 description = "takes the encoding of the text files (e.g. \"UTF-8\"). See javadoc for java.nio.charset.Charset for a list of encoding names.") 179 private String encoding; 180 181 public static final String PARAM_SUFFIXES = "suffixes"; 182 183 @ConfigurationParameter( 184 name = PARAM_SUFFIXES, 185 mandatory = false, 186 description = "takes suffixes (e.g. .txt) of the files that should be read in.") 187 private String[] suffixes; 188 189 public static final String PARAM_PATTERNS = "patterns"; 190 191 @ConfigurationParameter( 192 name = PARAM_PATTERNS, 193 mandatory = false, 194 description = " takes regular expressions for matching the files that should be read in. Note that these will be searched for" 195 + " using java.util. regex.Matcher.find, so if you want to make sure the entire file name matches a pattern, you should start the string with ^ and end the" 196 + " string with $.") 197 private String[] patterns; 198 199 public static final String PARAM_NAME_FILES_FILE_NAMES = "nameFilesFileNames"; 200 201 @ConfigurationParameter( 202 name = PARAM_NAME_FILES_FILE_NAMES, 203 mandatory = false, 204 description = "names files which contain lists of file names. For example, if the value 'mydata/mylist.txt' is provided, " 205 + "then the file 'mylist.txt' should contain a line delimited list of file names. The file names in the list should not have directory information " 206 + "but should just be the names of the files. The directory is determined by 'rootFile' and the files that are processed result from " 207 + "traversing the directory structure provided and looking for files with a name found in the lists of file names. That is, no exception will be " 208 + "thrown if a file name in the list does not actually correspond to a file.") 209 private String[] nameFilesFileNames; 210 211 public static final String PARAM_FILE_NAMES = "fileNames"; 212 213 @ConfigurationParameter( 214 name = PARAM_FILE_NAMES, 215 mandatory = false, 216 description = "provides a list of file names that should be read in. The directory of the file names is determined by " 217 + "'rootFile' and the files that are processed result from traversing the directory structure provided and looking for files with a name found in the list of file names. " 218 + "That is, no exception will be thrown if a file name in the list does not actually correspond to a file.") 219 private String[] fileNames; 220 221 public static final String PARAM_IGNORE_SYSTEM_FILES = "ignoreSystemFiles"; 222 223 @ConfigurationParameter( 224 name = PARAM_IGNORE_SYSTEM_FILES, 225 mandatory = false, 226 description = "This parameter provides a flag that determines whether file iteration will traverse into directories that begin with a period '.' - to loosely correspond to 'system' files. Setting this parameter to true will not cause file names that begin with a period to be ignored - just directories. ") 227 private boolean ignoreSystemFiles = true; 228 229 protected Iterator<File> files; 230 231 protected File currentFile; 232 233 protected int completed = 0; 234 235 protected int filesCount = 0; 236 237 @Override 238 public void initialize(UimaContext context) throws ResourceInitializationException { 239 // raise an exception if the root file does not exist 240 if (!this.rootFile.exists()) { 241 String format = "file or directory %s does not exist"; 242 String message = String.format(format, rootFile.getPath()); 243 throw new ResourceInitializationException(new IOException(message)); 244 } 245 246 if (rootFile.isFile()) { 247 files = Arrays.asList(rootFile).iterator(); 248 filesCount = 1; 249 } else { 250 251 files = createFileIterator(); 252 filesCount = countFiles(createFileIterator()); 253 } 254 } 255 256 protected Iterator<File> createFileIterator() throws ResourceInitializationException { 257 IOFileFilter fileFilter = TrueFileFilter.INSTANCE; 258 259 if (suffixes != null) { 260 fileFilter = new AndFileFilter(fileFilter, new SuffixFileFilter(suffixes)); 261 } 262 263 if (patterns != null && patterns.length > 0) { 264 265 IOFileFilter patternFilter = new RegexFileFilter(Pattern.compile(patterns[0])); 266 if (patterns.length > 1) { 267 for (int i = 1; i < patterns.length; i++) { 268 patternFilter = new OrFileFilter(patternFilter, new RegexFileFilter(patterns[i])); 269 } 270 } 271 fileFilter = new AndFileFilter(fileFilter, patternFilter); 272 273 } 274 275 if (nameFilesFileNames != null) { 276 List<String> fileNamesFromLists = new ArrayList<String>(); 277 try { 278 for (String fileNamesList : nameFilesFileNames) { 279 fileNamesFromLists.addAll(Arrays.asList(FileUtil.loadListOfStrings(new File(fileNamesList)))); 280 } 281 fileFilter = new AndFileFilter(fileFilter, new NameFileFilter(fileNamesFromLists)); 282 } catch (IOException ioe) { 283 throw new ResourceInitializationException(ioe); 284 } 285 } 286 287 if (fileNames != null) { 288 fileFilter = new AndFileFilter(fileFilter, new NameFileFilter(fileNames)); 289 } 290 291 IOFileFilter directoryFilter = TrueFileFilter.INSTANCE; 292 293 if (ignoreSystemFiles) { 294 directoryFilter = new RegexFileFilter("^[^\\.].*$"); 295 fileFilter = new AndFileFilter(fileFilter, new RegexFileFilter("^[^\\.].*$")); 296 } 297 298 return org.apache.commons.io.FileUtils.iterateFiles(rootFile, fileFilter, directoryFilter); 299 300 } 301 302 public void getNext(JCas jCas) throws IOException, CollectionException { 303 if (!hasNext()) { 304 throw new RuntimeException("getNext(jCas) was called but hasNext() returns false"); 305 } 306 // get a JCas object 307 JCas view; 308 try { 309 view = ViewCreatorAnnotator.createViewSafely(jCas, this.viewName); 310 } catch (AnalysisEngineProcessException e) { 311 throw new CollectionException(e); 312 } 313 314 // set the document's text 315 String text = FileUtils.file2String(currentFile, this.encoding); 316 view.setSofaDataString(text, "text/plain"); 317 318 // set language if it was specified 319 if (this.language != null) { 320 view.setDocumentLanguage(this.language); 321 } 322 323 // set the document URI 324 ViewUriUtil.setURI(jCas, currentFile.toURI()); 325 326 completed++; 327 currentFile = null; 328 } 329 330 protected int countFiles(Iterator<File> tempFiles) { 331 int count = 0; 332 while (tempFiles.hasNext()) { 333 File file = tempFiles.next(); 334 if (file.isFile()) 335 count++; 336 } 337 return count; 338 } 339 340 public Progress[] getProgress() { 341 Progress progress = new ProgressImpl(completed, filesCount, Progress.ENTITIES); 342 return new Progress[] { progress }; 343 } 344 345 public boolean hasNext() throws IOException, CollectionException { 346 if (currentFile != null) { 347 return true; 348 } 349 while (this.files.hasNext()) { 350 currentFile = files.next(); 351 if (currentFile.isFile()) { 352 return true; 353 } 354 } 355 return false; 356 } 357 358 public void close() throws IOException { 359 } 360}