001/** 002 * Copyright (c) 2012, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.util.cr; 025 026import java.io.File; 027import java.io.FileFilter; 028import java.io.IOException; 029import java.net.URI; 030import java.net.URISyntaxException; 031import java.util.ArrayList; 032import java.util.Collection; 033import java.util.Iterator; 034 035import org.apache.commons.io.FileUtils; 036import org.apache.commons.io.filefilter.FileFilterUtils; 037import org.apache.commons.io.filefilter.HiddenFileFilter; 038import org.apache.commons.io.filefilter.IOFileFilter; 039import org.apache.uima.UimaContext; 040import org.apache.uima.collection.CollectionException; 041import org.apache.uima.collection.CollectionReader; 042import org.apache.uima.collection.CollectionReaderDescription; 043import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; 044import org.apache.uima.fit.descriptor.ConfigurationParameter; 045import org.apache.uima.fit.factory.CollectionReaderFactory; 046import org.apache.uima.jcas.JCas; 047import org.apache.uima.resource.ResourceInitializationException; 048import org.apache.uima.util.Progress; 049import org.apache.uima.util.ProgressImpl; 050import org.cleartk.util.ViewUriUtil; 051 052import com.google.common.base.Function; 053import com.google.common.collect.Iterables; 054 055/** 056 * <br> 057 * Copyright (c) 2012, Regents of the University of Colorado <br> 058 * All rights reserved. 059 * <p> 060 * 061 * A CollectionReader that populates the default sofa with URI. This can accept a Collection of 062 * Files, Collection of URIs or a single directory. If given a directory it will create a jCas for 063 * each file within the directory. Recursion is controlled using the directoryFilter parameter. By 064 * default this will reject system files and recurse into subdirectories. 065 * <p> 066 * This should be used in conjunction with UriToDocumentTextAnnotator or UriToXmiCasAnnotator 067 * 068 * @author Lee Becker 069 * 070 */ 071public class UriCollectionReader extends JCasCollectionReader_ImplBase { 072 073 public static class RejectSystemFiles implements IOFileFilter { 074 FileFilter f = FileFilterUtils.fileFileFilter(); 075 076 @Override 077 public boolean accept(File file) { 078 return FileFilterUtils.fileFileFilter().accept(file) && HiddenFileFilter.VISIBLE.accept(file); 079 } 080 081 @Override 082 public boolean accept(File dir, String name) { 083 File file = new File(dir, name); 084 return FileFilterUtils.directoryFileFilter().accept(file) 085 && HiddenFileFilter.VISIBLE.accept(file) && this.accept(file); 086 } 087 } 088 089 public static class RejectSystemDirectories implements IOFileFilter { 090 091 @Override 092 public boolean accept(File file) { 093 return FileFilterUtils.directoryFileFilter().accept(file) 094 && HiddenFileFilter.VISIBLE.accept(file); 095 } 096 097 @Override 098 public boolean accept(File dir, String name) { 099 File file = new File(dir, name); 100 return FileFilterUtils.directoryFileFilter().accept(file) 101 && HiddenFileFilter.VISIBLE.accept(file) && this.accept(file); 102 } 103 } 104 105 public static CollectionReaderDescription getDescriptionFromDirectory(File directory) 106 throws ResourceInitializationException { 107 return CollectionReaderFactory.createReaderDescription( 108 UriCollectionReader.class, 109 null, 110 PARAM_DIRECTORY, 111 directory); 112 } 113 114 public static CollectionReaderDescription getDescriptionFromDirectory( 115 File directory, 116 Class<? extends IOFileFilter> fileFilterClass, 117 Class<? extends IOFileFilter> dirFilterClass) throws ResourceInitializationException { 118 return CollectionReaderFactory.createReaderDescription( 119 UriCollectionReader.class, 120 null, 121 PARAM_DIRECTORY, 122 directory, 123 PARAM_FILE_FILTER_CLASS, 124 fileFilterClass, 125 PARAM_DIRECTORY_FILTER_CLASS, 126 dirFilterClass); 127 } 128 129 public static CollectionReader getCollectionReaderFromDirectory(File directory) 130 throws ResourceInitializationException { 131 return CollectionReaderFactory.createReader(getDescriptionFromDirectory(directory)); 132 } 133 134 public static CollectionReader getCollectionReaderFromDirectory( 135 File directory, 136 Class<? extends IOFileFilter> fileFilterClass, 137 Class<? extends IOFileFilter> dirFilterClass) throws ResourceInitializationException { 138 return CollectionReaderFactory.createReader(getDescriptionFromDirectory( 139 directory, 140 fileFilterClass, 141 dirFilterClass)); 142 } 143 144 public static CollectionReaderDescription getDescriptionFromFiles(Collection<File> files) 145 throws ResourceInitializationException { 146 147 return CollectionReaderFactory.createReaderDescription( 148 UriCollectionReader.class, 149 null, 150 PARAM_FILES, 151 files); 152 } 153 154 public static CollectionReader getCollectionReaderFromFiles(Collection<File> files) 155 throws ResourceInitializationException { 156 return CollectionReaderFactory.createReader(getDescriptionFromFiles(files)); 157 } 158 159 public static CollectionReaderDescription getDescriptionFromUris(Collection<URI> uris) 160 throws ResourceInitializationException { 161 162 return CollectionReaderFactory.createReaderDescription( 163 UriCollectionReader.class, 164 null, 165 PARAM_URIS, 166 uris); 167 } 168 169 public static CollectionReader getCollectionReaderFromUris(Collection<URI> uris) 170 throws ResourceInitializationException { 171 return CollectionReaderFactory.createReader(getDescriptionFromUris(uris)); 172 } 173 174 public static final String PARAM_FILES = "files"; 175 176 @ConfigurationParameter( 177 name = PARAM_FILES, 178 mandatory = false, 179 description = "provides a list of files whose URI should be written to the default sofa within the CAS") 180 private Collection<File> files = new ArrayList<File>(); 181 182 public static final String PARAM_DIRECTORY = "directory"; 183 184 @ConfigurationParameter( 185 name = PARAM_DIRECTORY, 186 mandatory = false, 187 description = "provids a directory containing files whose URIs should be written to the defaul sofa within the CAS") 188 private File directory = null; 189 190 public static final String PARAM_URIS = "uris"; 191 192 @ConfigurationParameter( 193 name = PARAM_URIS, 194 mandatory = false, 195 description = "This parameter provides a list of URIs that should be written to the default sofa within the CAS. Proper URI construction is the responsibility of the caller") 196 private Collection<URI> uris = new ArrayList<URI>(); 197 198 public static final String PARAM_FILE_FILTER_CLASS = "fileFilterClass"; 199 200 @ConfigurationParameter( 201 name = PARAM_FILE_FILTER_CLASS, 202 defaultValue = "org.cleartk.util.cr.UriCollectionReader.RejectSystemFiles", 203 mandatory = false, 204 description = "The class used for filtering files when PARAM_DIRECTORY is set") 205 private Class<? extends IOFileFilter> fileFilterClass; 206 207 public static final String PARAM_DIRECTORY_FILTER_CLASS = "directoryFilterClass"; 208 209 @ConfigurationParameter( 210 name = PARAM_DIRECTORY_FILTER_CLASS, 211 defaultValue = "org.cleartk.util.cr.UriCollectionReader.RejectSystemDirectories", 212 mandatory = false, 213 description = "The class used for filtering sub-directories when PARAM_DIRECTORY is set. To disable recursion, pass in a directory filter that rejects all directory files") 214 private Class<? extends IOFileFilter> directoryFilterClass; 215 216 protected Iterator<URI> uriIter; 217 218 protected int numUrisCompleted = 0; 219 220 protected int uriCount = 0; 221 222 protected Function<String, URI> stringToUri = new Function<String, URI>() { 223 @Override 224 public URI apply(String input) { 225 try { 226 return new URI(input); 227 } catch (URISyntaxException e) { 228 throw new RuntimeException(e); 229 } 230 } 231 }; 232 233 protected Function<File, URI> fileToUri = new Function<File, URI>() { 234 @Override 235 public URI apply(File input) { 236 return input.toURI(); 237 } 238 }; 239 240 @Override 241 public void initialize(UimaContext context) throws ResourceInitializationException { 242 243 // Convert list of files to URIs 244 // Iterable<File> filteredFiles = Iterables.filter(this.files, this.directoryFilesFilter); 245 this.uriCount += this.files.size(); 246 Iterable<URI> urisFromFiles = Iterables.transform(this.files, this.fileToUri); 247 248 // Read file names from directory and convert list of files to URI 249 Iterable<URI> urisFromDirectory = new ArrayList<URI>(); 250 if (this.isDirectoryValid()) { 251 IOFileFilter fileFilter; 252 IOFileFilter directoryFilter; 253 254 try { 255 fileFilter = this.fileFilterClass.newInstance(); 256 directoryFilter = this.directoryFilterClass.newInstance(); 257 } catch (InstantiationException e) { 258 throw new ResourceInitializationException(e); 259 } catch (IllegalAccessException e) { 260 throw new ResourceInitializationException(e); 261 } 262 263 Collection<File> filesInDir = FileUtils.listFiles(this.directory, fileFilter, directoryFilter); 264 urisFromDirectory = Iterables.transform(filesInDir, this.fileToUri); 265 this.uriCount += filesInDir.size(); 266 } 267 268 // Combine URI iterables from all conditions and initialize iterator 269 this.uriIter = Iterables.concat(this.uris, urisFromFiles, urisFromDirectory).iterator(); 270 } 271 272 private boolean isDirectoryValid() throws ResourceInitializationException { 273 if (this.directory == null) { 274 return false; 275 } 276 277 if (!this.directory.exists()) { 278 String format = "Directory %s does not exist"; 279 String message = String.format(format, directory.getPath()); 280 throw new ResourceInitializationException(new IOException(message)); 281 } 282 283 if (!this.directory.isDirectory()) { 284 String format = "Directory %s is not a directory. For specific files set PARAM_FILES instead of PARAM_DIRECTORY."; 285 String message = String.format(format, directory.getPath()); 286 throw new ResourceInitializationException(new IOException(message)); 287 } 288 return true; 289 } 290 291 @Override 292 public boolean hasNext() throws IOException, CollectionException { 293 return this.uriIter.hasNext(); 294 } 295 296 @Override 297 public Progress[] getProgress() { 298 Progress progress = new ProgressImpl(numUrisCompleted, uriCount, Progress.ENTITIES); 299 return new Progress[] { progress }; 300 } 301 302 @Override 303 public void getNext(JCas jCas) throws IOException, CollectionException { 304 if (!this.hasNext()) { 305 throw new RuntimeException("getNext(jCas) was called but hasNext() returns false"); 306 } 307 308 ViewUriUtil.setURI(jCas, this.uriIter.next()); 309 } 310 311}