001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.util.cr;
025
026import java.io.File;
027import java.io.IOException;
028import java.util.ArrayList;
029import java.util.Arrays;
030import java.util.Iterator;
031import java.util.List;
032import java.util.regex.Pattern;
033
034import org.apache.commons.io.filefilter.AndFileFilter;
035import org.apache.commons.io.filefilter.IOFileFilter;
036import org.apache.commons.io.filefilter.NameFileFilter;
037import org.apache.commons.io.filefilter.OrFileFilter;
038import org.apache.commons.io.filefilter.SuffixFileFilter;
039import org.apache.commons.io.filefilter.TrueFileFilter;
040import org.apache.uima.UimaContext;
041import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
042import org.apache.uima.cas.CAS;
043import org.apache.uima.collection.CollectionException;
044import org.apache.uima.collection.CollectionReader;
045import org.apache.uima.collection.CollectionReaderDescription;
046import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
047import org.apache.uima.fit.component.ViewCreatorAnnotator;
048import org.apache.uima.fit.descriptor.ConfigurationParameter;
049import org.apache.uima.fit.descriptor.SofaCapability;
050import org.apache.uima.fit.factory.CollectionReaderFactory;
051import org.apache.uima.jcas.JCas;
052import org.apache.uima.pear.util.FileUtil;
053import org.apache.uima.resource.ResourceInitializationException;
054import org.apache.uima.util.FileUtils;
055import org.apache.uima.util.Progress;
056import org.apache.uima.util.ProgressImpl;
057import org.cleartk.util.ViewUriUtil;
058
059/**
060 * <br>
061 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
062 * All rights reserved.
063 * <p>
064 * 
065 * A CollectionReader that loads all files in a directory tree.
066 * 
067 * Files are loaded as plain text and stored in the JCas view selected by the user. ClearTK Document
068 * objects are added to the same JCas view to record the file IDs and paths.
069 * 
070 * @author Steven Bethard
071 * @author Philip Ogren
072 */
073@SofaCapability(outputSofas = ViewUriUtil.URI)
074public class FilesCollectionReader extends JCasCollectionReader_ImplBase {
075
076  public static CollectionReaderDescription getDescription(String fileOrDir)
077      throws ResourceInitializationException {
078    return CollectionReaderFactory.createReaderDescription(
079        FilesCollectionReader.class,
080        null,
081        PARAM_ROOT_FILE,
082        fileOrDir);
083  }
084
085  public static CollectionReader getCollectionReader(String fileOrDir)
086      throws ResourceInitializationException {
087    return CollectionReaderFactory.createReader(getDescription(fileOrDir));
088  }
089
090  public static CollectionReaderDescription getDescriptionWithView(String dir, String viewName)
091      throws ResourceInitializationException {
092    return CollectionReaderFactory.createReaderDescription(
093        FilesCollectionReader.class,
094        PARAM_ROOT_FILE,
095        dir,
096        PARAM_VIEW_NAME,
097        viewName);
098  }
099
100  public static CollectionReader getCollectionReaderWithView(String dir, String viewName)
101      throws ResourceInitializationException {
102    return CollectionReaderFactory.createReader(getDescriptionWithView(dir, viewName));
103  }
104
105  public static CollectionReaderDescription getDescriptionWithPatterns(
106      String dir,
107      String viewName,
108      String... patterns) throws ResourceInitializationException {
109    return CollectionReaderFactory.createReaderDescription(
110        FilesCollectionReader.class,
111        PARAM_ROOT_FILE,
112        dir,
113        PARAM_VIEW_NAME,
114        viewName,
115        PARAM_PATTERNS,
116        patterns);
117  }
118
119  public static CollectionReader getCollectionReaderWithPatterns(
120      String dir,
121      String viewName,
122      String... patterns) throws ResourceInitializationException {
123    return CollectionReaderFactory.createReader(getDescriptionWithPatterns(dir, viewName, patterns));
124  }
125
126  public static CollectionReaderDescription getDescriptionWithSuffixes(
127      String dir,
128      String viewName,
129      String... suffixes) throws ResourceInitializationException {
130    return CollectionReaderFactory.createReaderDescription(
131        FilesCollectionReader.class,
132        PARAM_ROOT_FILE,
133        dir,
134        PARAM_VIEW_NAME,
135        viewName,
136        PARAM_SUFFIXES,
137        suffixes);
138  }
139
140  public static CollectionReader getCollectionReaderWithSuffixes(
141      String dir,
142      String viewName,
143      String... suffixes) throws ResourceInitializationException {
144    return CollectionReaderFactory.createReader(getDescriptionWithSuffixes(dir, viewName, suffixes));
145  }
146
147  public static final String PARAM_ROOT_FILE = "rootFile";
148
149  @ConfigurationParameter(
150      name = PARAM_ROOT_FILE,
151      mandatory = true,
152      description = "takes either the name of a single file or the root directory containing all the files to be processed.")
153  protected File rootFile;
154
155  public static final String PARAM_VIEW_NAME = "viewName";
156
157  @ConfigurationParameter(
158      name = PARAM_VIEW_NAME,
159      mandatory = false,
160      description = "takes the the name that should be given to the JCas view that the document texts should be set to.",
161      defaultValue = CAS.NAME_DEFAULT_SOFA)
162  private String viewName;
163
164  public static final String PARAM_LANGUAGE = "language";
165
166  @ConfigurationParameter(
167      name = PARAM_LANGUAGE,
168      mandatory = false,
169      description = "takes the language code corresponding to the language of the documents being examined.  The value of this parameter "
170          + "is simply passed on to JCas.setDocumentLanguage(String).")
171  private String language;
172
173  public static final String PARAM_ENCODING = "encoding";
174
175  @ConfigurationParameter(
176      name = PARAM_ENCODING,
177      mandatory = false,
178      description = "takes the encoding of the text files (e.g. \"UTF-8\").  See javadoc for java.nio.charset.Charset for a list of encoding names.")
179  private String encoding;
180
181  public static final String PARAM_SUFFIXES = "suffixes";
182
183  @ConfigurationParameter(
184      name = PARAM_SUFFIXES,
185      mandatory = false,
186      description = "takes suffixes (e.g. .txt) of the files that should be read in.")
187  private String[] suffixes;
188
189  public static final String PARAM_PATTERNS = "patterns";
190
191  @ConfigurationParameter(
192      name = PARAM_PATTERNS,
193      mandatory = false,
194      description = "   takes regular expressions for matching the files that should be read in. Note that these will be searched for"
195          + " using java.util. regex.Matcher.find, so if you want to make sure the entire file name matches a pattern, you should start the string with ^ and end the"
196          + " string with $.")
197  private String[] patterns;
198
199  public static final String PARAM_NAME_FILES_FILE_NAMES = "nameFilesFileNames";
200
201  @ConfigurationParameter(
202      name = PARAM_NAME_FILES_FILE_NAMES,
203      mandatory = false,
204      description = "names files which contain lists of file names. For example, if the value 'mydata/mylist.txt' is provided, "
205          + "then the file 'mylist.txt' should contain a line delimited list of file names.  The file names in the list should not have directory information "
206          + "but should just be the names of the files. The directory is determined by 'rootFile' and the files that are processed result from "
207          + "traversing the directory structure provided and looking for files with a name found in the lists of file names. That is, no exception will be "
208          + "thrown if a file name in the list does not actually correspond to a file.")
209  private String[] nameFilesFileNames;
210
211  public static final String PARAM_FILE_NAMES = "fileNames";
212
213  @ConfigurationParameter(
214      name = PARAM_FILE_NAMES,
215      mandatory = false,
216      description = "provides a list of file names that should be read in. The directory of the file names is determined by "
217          + "'rootFile' and the files that are processed result from traversing the directory structure provided and looking for files with a name found in the list of file names. "
218          + "That is, no exception will be thrown if a file name in the list does not actually correspond to a file.")
219  private String[] fileNames;
220
221  public static final String PARAM_IGNORE_SYSTEM_FILES = "ignoreSystemFiles";
222
223  @ConfigurationParameter(
224      name = PARAM_IGNORE_SYSTEM_FILES,
225      mandatory = false,
226      description = "This parameter provides a flag that determines whether file iteration will traverse into directories that begin with a period '.' - to loosely correspond to 'system' files.  Setting this parameter to true will not cause file names that begin with a period to be ignored - just directories. ")
227  private boolean ignoreSystemFiles = true;
228
229  protected Iterator<File> files;
230
231  protected File currentFile;
232
233  protected int completed = 0;
234
235  protected int filesCount = 0;
236
237  @Override
238  public void initialize(UimaContext context) throws ResourceInitializationException {
239    // raise an exception if the root file does not exist
240    if (!this.rootFile.exists()) {
241      String format = "file or directory %s does not exist";
242      String message = String.format(format, rootFile.getPath());
243      throw new ResourceInitializationException(new IOException(message));
244    }
245
246    if (rootFile.isFile()) {
247      files = Arrays.asList(rootFile).iterator();
248      filesCount = 1;
249    } else {
250
251      files = createFileIterator();
252      filesCount = countFiles(createFileIterator());
253    }
254  }
255
256  protected Iterator<File> createFileIterator() throws ResourceInitializationException {
257    IOFileFilter fileFilter = TrueFileFilter.INSTANCE;
258
259    if (suffixes != null) {
260      fileFilter = new AndFileFilter(fileFilter, new SuffixFileFilter(suffixes));
261    }
262
263    if (patterns != null && patterns.length > 0) {
264
265      IOFileFilter patternFilter = new RegexFileFilter(Pattern.compile(patterns[0]));
266      if (patterns.length > 1) {
267        for (int i = 1; i < patterns.length; i++) {
268          patternFilter = new OrFileFilter(patternFilter, new RegexFileFilter(patterns[i]));
269        }
270      }
271      fileFilter = new AndFileFilter(fileFilter, patternFilter);
272
273    }
274
275    if (nameFilesFileNames != null) {
276      List<String> fileNamesFromLists = new ArrayList<String>();
277      try {
278        for (String fileNamesList : nameFilesFileNames) {
279          fileNamesFromLists.addAll(Arrays.asList(FileUtil.loadListOfStrings(new File(fileNamesList))));
280        }
281        fileFilter = new AndFileFilter(fileFilter, new NameFileFilter(fileNamesFromLists));
282      } catch (IOException ioe) {
283        throw new ResourceInitializationException(ioe);
284      }
285    }
286
287    if (fileNames != null) {
288      fileFilter = new AndFileFilter(fileFilter, new NameFileFilter(fileNames));
289    }
290
291    IOFileFilter directoryFilter = TrueFileFilter.INSTANCE;
292
293    if (ignoreSystemFiles) {
294      directoryFilter = new RegexFileFilter("^[^\\.].*$");
295      fileFilter = new AndFileFilter(fileFilter, new RegexFileFilter("^[^\\.].*$"));
296    }
297
298    return org.apache.commons.io.FileUtils.iterateFiles(rootFile, fileFilter, directoryFilter);
299
300  }
301
302  public void getNext(JCas jCas) throws IOException, CollectionException {
303    if (!hasNext()) {
304      throw new RuntimeException("getNext(jCas) was called but hasNext() returns false");
305    }
306    // get a JCas object
307    JCas view;
308    try {
309      view = ViewCreatorAnnotator.createViewSafely(jCas, this.viewName);
310    } catch (AnalysisEngineProcessException e) {
311      throw new CollectionException(e);
312    }
313
314    // set the document's text
315    String text = FileUtils.file2String(currentFile, this.encoding);
316    view.setSofaDataString(text, "text/plain");
317
318    // set language if it was specified
319    if (this.language != null) {
320      view.setDocumentLanguage(this.language);
321    }
322
323    // set the document URI
324    ViewUriUtil.setURI(jCas, currentFile.toURI());
325
326    completed++;
327    currentFile = null;
328  }
329
330  protected int countFiles(Iterator<File> tempFiles) {
331    int count = 0;
332    while (tempFiles.hasNext()) {
333      File file = tempFiles.next();
334      if (file.isFile())
335        count++;
336    }
337    return count;
338  }
339
340  public Progress[] getProgress() {
341    Progress progress = new ProgressImpl(completed, filesCount, Progress.ENTITIES);
342    return new Progress[] { progress };
343  }
344
345  public boolean hasNext() throws IOException, CollectionException {
346    if (currentFile != null) {
347      return true;
348    }
349    while (this.files.hasNext()) {
350      currentFile = files.next();
351      if (currentFile.isFile()) {
352        return true;
353      }
354    }
355    return false;
356  }
357
358  public void close() throws IOException {
359  }
360}