001/** 
002 * Copyright (c) 2012, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.util.cr;
025
026import java.io.File;
027import java.io.FileFilter;
028import java.io.IOException;
029import java.net.URI;
030import java.net.URISyntaxException;
031import java.util.ArrayList;
032import java.util.Collection;
033import java.util.Iterator;
034
035import org.apache.commons.io.FileUtils;
036import org.apache.commons.io.filefilter.FileFilterUtils;
037import org.apache.commons.io.filefilter.HiddenFileFilter;
038import org.apache.commons.io.filefilter.IOFileFilter;
039import org.apache.uima.UimaContext;
040import org.apache.uima.collection.CollectionException;
041import org.apache.uima.collection.CollectionReader;
042import org.apache.uima.collection.CollectionReaderDescription;
043import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
044import org.apache.uima.fit.descriptor.ConfigurationParameter;
045import org.apache.uima.fit.factory.CollectionReaderFactory;
046import org.apache.uima.jcas.JCas;
047import org.apache.uima.resource.ResourceInitializationException;
048import org.apache.uima.util.Progress;
049import org.apache.uima.util.ProgressImpl;
050import org.cleartk.util.ViewUriUtil;
051
052import com.google.common.base.Function;
053import com.google.common.collect.Iterables;
054
055/**
056 * <br>
057 * Copyright (c) 2012, Regents of the University of Colorado <br>
058 * All rights reserved.
059 * <p>
060 * 
061 * A CollectionReader that populates the default sofa with URI. This can accept a Collection of
062 * Files, Collection of URIs or a single directory. If given a directory it will create a jCas for
063 * each file within the directory. Recursion is controlled using the directoryFilter parameter. By
064 * default this will reject system files and recurse into subdirectories.
065 * <p>
066 * This should be used in conjunction with UriToDocumentTextAnnotator or UriToXmiCasAnnotator
067 * 
068 * @author Lee Becker
069 * 
070 */
071public class UriCollectionReader extends JCasCollectionReader_ImplBase {
072
073  public static class RejectSystemFiles implements IOFileFilter {
074    FileFilter f = FileFilterUtils.fileFileFilter();
075
076    @Override
077    public boolean accept(File file) {
078      return FileFilterUtils.fileFileFilter().accept(file) && HiddenFileFilter.VISIBLE.accept(file);
079    }
080
081    @Override
082    public boolean accept(File dir, String name) {
083      File file = new File(dir, name);
084      return FileFilterUtils.directoryFileFilter().accept(file)
085          && HiddenFileFilter.VISIBLE.accept(file) && this.accept(file);
086    }
087  }
088
089  public static class RejectSystemDirectories implements IOFileFilter {
090
091    @Override
092    public boolean accept(File file) {
093      return FileFilterUtils.directoryFileFilter().accept(file)
094          && HiddenFileFilter.VISIBLE.accept(file);
095    }
096
097    @Override
098    public boolean accept(File dir, String name) {
099      File file = new File(dir, name);
100      return FileFilterUtils.directoryFileFilter().accept(file)
101          && HiddenFileFilter.VISIBLE.accept(file) && this.accept(file);
102    }
103  }
104
105  public static CollectionReaderDescription getDescriptionFromDirectory(File directory)
106      throws ResourceInitializationException {
107    return CollectionReaderFactory.createReaderDescription(
108        UriCollectionReader.class,
109        null,
110        PARAM_DIRECTORY,
111        directory);
112  }
113
114  public static CollectionReaderDescription getDescriptionFromDirectory(
115      File directory,
116      Class<? extends IOFileFilter> fileFilterClass,
117      Class<? extends IOFileFilter> dirFilterClass) throws ResourceInitializationException {
118    return CollectionReaderFactory.createReaderDescription(
119        UriCollectionReader.class,
120        null,
121        PARAM_DIRECTORY,
122        directory,
123        PARAM_FILE_FILTER_CLASS,
124        fileFilterClass,
125        PARAM_DIRECTORY_FILTER_CLASS,
126        dirFilterClass);
127  }
128
129  public static CollectionReader getCollectionReaderFromDirectory(File directory)
130      throws ResourceInitializationException {
131    return CollectionReaderFactory.createReader(getDescriptionFromDirectory(directory));
132  }
133
134  public static CollectionReader getCollectionReaderFromDirectory(
135      File directory,
136      Class<? extends IOFileFilter> fileFilterClass,
137      Class<? extends IOFileFilter> dirFilterClass) throws ResourceInitializationException {
138    return CollectionReaderFactory.createReader(getDescriptionFromDirectory(
139        directory,
140        fileFilterClass,
141        dirFilterClass));
142  }
143
144  public static CollectionReaderDescription getDescriptionFromFiles(Collection<File> files)
145      throws ResourceInitializationException {
146
147    return CollectionReaderFactory.createReaderDescription(
148        UriCollectionReader.class,
149        null,
150        PARAM_FILES,
151        files);
152  }
153
154  public static CollectionReader getCollectionReaderFromFiles(Collection<File> files)
155      throws ResourceInitializationException {
156    return CollectionReaderFactory.createReader(getDescriptionFromFiles(files));
157  }
158
159  public static CollectionReaderDescription getDescriptionFromUris(Collection<URI> uris)
160      throws ResourceInitializationException {
161
162    return CollectionReaderFactory.createReaderDescription(
163        UriCollectionReader.class,
164        null,
165        PARAM_URIS,
166        uris);
167  }
168
169  public static CollectionReader getCollectionReaderFromUris(Collection<URI> uris)
170      throws ResourceInitializationException {
171    return CollectionReaderFactory.createReader(getDescriptionFromUris(uris));
172  }
173
174  public static final String PARAM_FILES = "files";
175
176  @ConfigurationParameter(
177      name = PARAM_FILES,
178      mandatory = false,
179      description = "provides a list of files whose URI should be written to the default sofa within the CAS")
180  private Collection<File> files = new ArrayList<File>();
181
182  public static final String PARAM_DIRECTORY = "directory";
183
184  @ConfigurationParameter(
185      name = PARAM_DIRECTORY,
186      mandatory = false,
187      description = "provids a directory containing files whose URIs should be written to the defaul sofa within the CAS")
188  private File directory = null;
189
190  public static final String PARAM_URIS = "uris";
191
192  @ConfigurationParameter(
193      name = PARAM_URIS,
194      mandatory = false,
195      description = "This parameter provides a list of URIs that should be written to the default sofa within the CAS.  Proper URI construction is the responsibility of the caller")
196  private Collection<URI> uris = new ArrayList<URI>();
197
198  public static final String PARAM_FILE_FILTER_CLASS = "fileFilterClass";
199
200  @ConfigurationParameter(
201      name = PARAM_FILE_FILTER_CLASS,
202      defaultValue = "org.cleartk.util.cr.UriCollectionReader.RejectSystemFiles",
203      mandatory = false,
204      description = "The class used for filtering files when PARAM_DIRECTORY is set")
205  private Class<? extends IOFileFilter> fileFilterClass;
206
207  public static final String PARAM_DIRECTORY_FILTER_CLASS = "directoryFilterClass";
208
209  @ConfigurationParameter(
210      name = PARAM_DIRECTORY_FILTER_CLASS,
211      defaultValue = "org.cleartk.util.cr.UriCollectionReader.RejectSystemDirectories",
212      mandatory = false,
213      description = "The class used for filtering sub-directories when PARAM_DIRECTORY is set.  To disable recursion, pass in a directory filter that rejects all directory files")
214  private Class<? extends IOFileFilter> directoryFilterClass;
215
216  protected Iterator<URI> uriIter;
217
218  protected int numUrisCompleted = 0;
219
220  protected int uriCount = 0;
221
222  protected Function<String, URI> stringToUri = new Function<String, URI>() {
223    @Override
224    public URI apply(String input) {
225      try {
226        return new URI(input);
227      } catch (URISyntaxException e) {
228        throw new RuntimeException(e);
229      }
230    }
231  };
232
233  protected Function<File, URI> fileToUri = new Function<File, URI>() {
234    @Override
235    public URI apply(File input) {
236      return input.toURI();
237    }
238  };
239
240  @Override
241  public void initialize(UimaContext context) throws ResourceInitializationException {
242
243    // Convert list of files to URIs
244    // Iterable<File> filteredFiles = Iterables.filter(this.files, this.directoryFilesFilter);
245    this.uriCount += this.files.size();
246    Iterable<URI> urisFromFiles = Iterables.transform(this.files, this.fileToUri);
247
248    // Read file names from directory and convert list of files to URI
249    Iterable<URI> urisFromDirectory = new ArrayList<URI>();
250    if (this.isDirectoryValid()) {
251      IOFileFilter fileFilter;
252      IOFileFilter directoryFilter;
253
254      try {
255        fileFilter = this.fileFilterClass.newInstance();
256        directoryFilter = this.directoryFilterClass.newInstance();
257      } catch (InstantiationException e) {
258        throw new ResourceInitializationException(e);
259      } catch (IllegalAccessException e) {
260        throw new ResourceInitializationException(e);
261      }
262
263      Collection<File> filesInDir = FileUtils.listFiles(this.directory, fileFilter, directoryFilter);
264      urisFromDirectory = Iterables.transform(filesInDir, this.fileToUri);
265      this.uriCount += filesInDir.size();
266    }
267
268    // Combine URI iterables from all conditions and initialize iterator
269    this.uriIter = Iterables.concat(this.uris, urisFromFiles, urisFromDirectory).iterator();
270  }
271
272  private boolean isDirectoryValid() throws ResourceInitializationException {
273    if (this.directory == null) {
274      return false;
275    }
276
277    if (!this.directory.exists()) {
278      String format = "Directory %s does not exist";
279      String message = String.format(format, directory.getPath());
280      throw new ResourceInitializationException(new IOException(message));
281    }
282
283    if (!this.directory.isDirectory()) {
284      String format = "Directory %s is not a directory.  For specific files set PARAM_FILES instead of PARAM_DIRECTORY.";
285      String message = String.format(format, directory.getPath());
286      throw new ResourceInitializationException(new IOException(message));
287    }
288    return true;
289  }
290
291  @Override
292  public boolean hasNext() throws IOException, CollectionException {
293    return this.uriIter.hasNext();
294  }
295
296  @Override
297  public Progress[] getProgress() {
298    Progress progress = new ProgressImpl(numUrisCompleted, uriCount, Progress.ENTITIES);
299    return new Progress[] { progress };
300  }
301
302  @Override
303  public void getNext(JCas jCas) throws IOException, CollectionException {
304    if (!this.hasNext()) {
305      throw new RuntimeException("getNext(jCas) was called but hasNext() returns false");
306    }
307
308    ViewUriUtil.setURI(jCas, this.uriIter.next());
309  }
310
311}