001/** 002 * Copyright (c) 2007-2008, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.util.cr.linereader; 025 026import java.io.BufferedReader; 027import java.io.File; 028import java.io.FileInputStream; 029import java.io.FileNotFoundException; 030import java.io.IOException; 031import java.io.InputStreamReader; 032import java.io.UnsupportedEncodingException; 033import java.util.Arrays; 034import java.util.Iterator; 035 036import org.apache.commons.io.FileUtils; 037import org.apache.commons.io.filefilter.SuffixFileFilter; 038import org.apache.commons.io.filefilter.TrueFileFilter; 039import org.apache.uima.UimaContext; 040import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 041import org.apache.uima.cas.CAS; 042import org.apache.uima.collection.CollectionException; 043import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; 044import org.apache.uima.fit.component.ViewCreatorAnnotator; 045import org.apache.uima.fit.descriptor.ConfigurationParameter; 046import org.apache.uima.fit.descriptor.SofaCapability; 047import org.apache.uima.fit.factory.initializable.InitializableFactory; 048import org.apache.uima.jcas.JCas; 049import org.apache.uima.resource.ResourceInitializationException; 050import org.apache.uima.util.Progress; 051import org.apache.uima.util.ProgressImpl; 052import org.cleartk.util.ViewUriUtil; 053 054/** 055 * <br> 056 * Copyright (c) 2007-2008, Regents of the University of Colorado <br> 057 * All rights reserved. 058 * <p> 059 * 060 * LineReader is collection reader for cases when you want to read in files line-by-line such that 061 * there is one JCas per line. 062 * 063 * <p> 064 * This class has no relation to LineWriter - i.e. LineReader does not provide 065 * "reverse functionality" of LineWriter. 066 * 067 * <p> 068 * This class is very similar to PlainTextCollectionReader in that it allows you to specify a file 069 * or directory from which to read in plain text into a named view with a specified language and 070 * encoding. However, instead of reading in entire files as plain text, this collection reader reads 071 * in a file line-by-line where each line gets its own JCas. 072 * <p> 073 * LineReader uses an interface LineHandler which determines how lines from a file are used to 074 * initialize a JCas. The default implementation, DefaultLineHandler, simply expects each line to be 075 * plain text and the id of the document will be the number of lines read up to that point (across 076 * all files that are being read in.) A second implementation, SimpleLineHandler, assumes that an id 077 * for each line is provided in the text of the line and parses it out. 078 * 079 * @author Steven Bethard 080 * @author Philip Ogren 081 */ 082@SofaCapability(outputSofas = ViewUriUtil.URI) 083public class LineReader extends JCasCollectionReader_ImplBase { 084 085 public static final String PARAM_FILE_OR_DIRECTORY_NAME = "fileOrDirectoryName"; 086 087 @ConfigurationParameter( 088 name = PARAM_FILE_OR_DIRECTORY_NAME, 089 mandatory = true, 090 description = "Takes either the name of a single file or the root directory containing all the files to be processed.") 091 private String fileOrDirectoryName; 092 093 public static final String PARAM_VIEW_NAME = "viewName"; 094 095 @ConfigurationParameter( 096 name = PARAM_VIEW_NAME, 097 mandatory = false, 098 description = "takes the the name that should be given to the JCas view associated with the document texts.", 099 defaultValue = CAS.NAME_DEFAULT_SOFA) 100 private String viewName; 101 102 public static final String PARAM_LANGUAGE = "language"; 103 104 @ConfigurationParameter( 105 name = PARAM_LANGUAGE, 106 mandatory = false, 107 description = "takes the language code corresponding to the language of the documents being examined. The value of this parameter is simply passed on to JCas.setDocumentLanguage(String)") 108 private String language; 109 110 public static final String PARAM_ENCODING = "encoding"; 111 112 @ConfigurationParameter( 113 name = PARAM_ENCODING, 114 mandatory = false, 115 description = "takes the encoding of the text files (e.g. 'UTF-8'). See apidocs for java.nio.charset.Charset for a list of encoding names.") 116 private String encoding; 117 118 public static final String PARAM_SUFFIXES = "suffixes"; 119 120 @ConfigurationParameter( 121 name = PARAM_SUFFIXES, 122 mandatory = false, 123 description = "Takes suffixes (e.g. .txt) of the files that should be read in.") 124 private String[] suffixes; 125 126 public static final String PARAM_LINE_HANDLER_CLASS_NAME = "lineHandlerClassName"; 127 128 @ConfigurationParameter( 129 name = PARAM_LINE_HANDLER_CLASS_NAME, 130 mandatory = false, 131 description = "specifies the class name of the LineHandler. If one is not specified, then the DefaultLineHandler will be used.", 132 defaultValue = "org.cleartk.util.cr.linereader.DefaultLineHandler") 133 private String lineHandlerClassName; 134 135 public static final String PARAM_COMMENT_SPECIFIERS = "commentSpecifiers"; 136 137 @ConfigurationParameter( 138 name = PARAM_COMMENT_SPECIFIERS, 139 mandatory = false, 140 description = "Specifies lines that should be considered 'comments' - i.e. lines that should be skipped. Commented lines are those the start with one of the values of this parameter.") 141 private String[] commentSpecifiers; 142 143 public static final String PARAM_SKIP_BLANK_LINES = "skipBlankLines"; 144 145 @ConfigurationParameter( 146 name = PARAM_SKIP_BLANK_LINES, 147 mandatory = false, 148 description = "Specifies whether blank lines should be skipped or not. The default value is true if no value is given. If this parameter is set to false, then blank lines that appear in the text files will be read in and given their own JCas. Blank lines are those that consist of only whitespace.", 149 defaultValue = "true") 150 private boolean skipBlankLines; 151 152 File file; 153 154 int lineNumber; 155 156 String line; 157 158 BufferedReader input; 159 160 LineHandler lineHandler; 161 162 @Override 163 public void initialize(UimaContext context) throws ResourceInitializationException { 164 try { 165 this.rootFile = new File(fileOrDirectoryName); 166 167 // raise an exception if the root file does not exist 168 if (!this.rootFile.exists()) { 169 String format = "file or directory %s does not exist"; 170 String message = String.format(format, fileOrDirectoryName); 171 throw new ResourceInitializationException(new IOException(message)); 172 } 173 174 if (rootFile.isDirectory()) { 175 if (suffixes != null && suffixes.length > 0) { 176 files = FileUtils.iterateFiles( 177 rootFile, 178 new SuffixFileFilter(suffixes), 179 TrueFileFilter.INSTANCE); 180 } else { 181 files = FileUtils.iterateFiles(rootFile, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE); 182 } 183 } else { 184 files = Arrays.asList(rootFile).iterator(); 185 } 186 if (commentSpecifiers == null) { 187 commentSpecifiers = new String[0]; 188 } 189 190 lineHandler = InitializableFactory.create( 191 getUimaContext(), 192 lineHandlerClassName, 193 LineHandler.class); 194 moveToNextFile(); 195 } catch (Exception fnfe) { 196 throw new ResourceInitializationException(fnfe); 197 } 198 } 199 200 public void getNext(JCas jCas) throws IOException, CollectionException { 201 hasNext(); 202 203 JCas view; 204 try { 205 view = ViewCreatorAnnotator.createViewSafely(jCas, this.viewName); 206 } catch (AnalysisEngineProcessException e) { 207 throw new CollectionException(e); 208 } 209 210 lineHandler.handleLine(view, rootFile, file, line); 211 212 // set language if it was specified 213 if (this.language != null) { 214 view.setDocumentLanguage(this.language); 215 } 216 217 completed++; 218 line = null; 219 } 220 221 private boolean moveToNextFile() throws FileNotFoundException, UnsupportedEncodingException { 222 if (files.hasNext()) { 223 file = (File) files.next(); 224 if (encoding != null) 225 input = new BufferedReader(new InputStreamReader(new FileInputStream(file), encoding)); 226 else 227 input = new BufferedReader(new InputStreamReader(new FileInputStream(file))); 228 229 lineNumber = 0; 230 return true; 231 } 232 return false; 233 } 234 235 public Progress[] getProgress() { 236 Progress progress = new ProgressImpl(completed, 1000000, Progress.ENTITIES); 237 return new Progress[] { progress }; 238 } 239 240 public boolean hasNext() throws IOException, CollectionException { 241 if (line == null) { 242 line = input.readLine(); 243 if (line != null) { 244 for (String commentSpecifier : commentSpecifiers) { 245 if (line.startsWith(commentSpecifier)) { 246 line = null; 247 return hasNext(); 248 } 249 } 250 if (skipBlankLines && line.trim().equals("")) { 251 line = null; 252 return hasNext(); 253 } 254 } 255 } 256 257 if (line == null) { 258 if (moveToNextFile()) 259 return hasNext(); 260 else 261 return false; 262 } 263 return true; 264 } 265 266 private File rootFile; 267 268 private Iterator<?> files; 269 270 private int completed = 0; 271 272 public void close() throws IOException { 273 } 274 275}