001/** 002 * Copyright (c) 2007-2008, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024 025package org.cleartk.util.ae.linewriter; 026 027import java.io.File; 028import java.io.FileNotFoundException; 029import java.io.PrintStream; 030 031import org.apache.uima.UimaContext; 032import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 033import org.apache.uima.cas.FSIterator; 034import org.apache.uima.cas.Type; 035import org.apache.uima.fit.component.JCasAnnotator_ImplBase; 036import org.apache.uima.fit.descriptor.ConfigurationParameter; 037import org.apache.uima.fit.factory.initializable.InitializableFactory; 038import org.apache.uima.fit.util.JCasUtil; 039import org.apache.uima.jcas.JCas; 040import org.apache.uima.jcas.tcas.Annotation; 041import org.apache.uima.resource.ResourceInitializationException; 042import org.cleartk.util.CleartkInitializationException; 043import org.cleartk.util.ReflectionUtil; 044import org.cleartk.util.ViewUriUtil; 045 046/** 047 * <br> 048 * Copyright (c) 2007-2008, Regents of the University of Colorado <br> 049 * All rights reserved. 050 * 051 * <p> 052 * 053 * This writer provides a way to write out annotations one-per-line to a plain text file in a 054 * variety of ways configurable at run time. 055 * 056 * <p> 057 * This class has no relation to LineReader - i.e. LineWriter does not provide 058 * "reverse functionality" of LineReader. 059 * 060 * <p> 061 * If you mistook this class for a line rider, then please redirect to the completely unrelated, but 062 * totally awesome Line Rider at: http://linerider.com 063 * 064 * @author Philip Ogren 065 */ 066 067public class LineWriter<ANNOTATION_TYPE extends Annotation, BLOCK_TYPE extends Annotation> extends 068 JCasAnnotator_ImplBase { 069 070 private static final String OUTPUT_DIRECTORY_NAME_DESCRIPTION = "takes a " 071 + "path to directory into which output files will be written. If no value is " 072 + "given for this parameter, then the parameter 'outputFileName' " 073 + "is required. If a value is given, then one file for each document/JCas will be created in " 074 + "the output directory provided. The name of each file will be given by the value returned by ViewURIUtil.getURI(jCas). " 075 + "If a value for both 'outputDirectoryName' and ' " 076 + "outputFileName' is given, then an exception will be thrown. Example values " 077 + "that could be provided might look like: \n\n" 078 + "\t/mydata/uima-output/\n" 079 + "\tC:/Documents and Settings/User/My Documents/workspace/My Project/data/experiment/output\n"; 080 081 public static final String PARAM_OUTPUT_DIRECTORY_NAME = "outputDirectoryName"; 082 083 @ConfigurationParameter( 084 description = OUTPUT_DIRECTORY_NAME_DESCRIPTION, 085 name = PARAM_OUTPUT_DIRECTORY_NAME, 086 mandatory = false) 087 private String outputDirectoryName; 088 089 private static final String FILE_SUFFIX_DESCRIPTION = "provides a file " 090 + "name suffix for each file generated by this writer. If there is no value " 091 + "given for the parameter 'outputDirectoryName', then this parameter is " 092 + "ignored. If 'outputDirectoryName' is given a value, then the generated files " 093 + "will be named by the document ids and the suffix provided by this parameter. If no value for this " 094 + "parameter is given, then the files will be named the same as the document id. Example values " 095 + "that could be provided might include: \n\n" + ".txt\n" + ".tokens\n" + ".annotations.txt"; 096 097 public static final String PARAM_FILE_SUFFIX = "fileSuffix"; 098 099 @ConfigurationParameter( 100 description = FILE_SUFFIX_DESCRIPTION, 101 name = PARAM_FILE_SUFFIX, 102 mandatory = false) 103 private String fileSuffix; 104 105 private static final String OUTPUT_FILE_NAME_DESCRIPTION = "takes a file " 106 + "name to write results to. If no value is given for this parameter, then " 107 + "the parameter 'outputDirectoryName' is required. " 108 + "If a value is given, then one file for all documents will be created in the " 109 + "output directory provided. If a value for both 'outputDirectoryName'" 110 + " and 'outputFileName' is given, then an exception will be thrown. " 111 + "Example values that could be provided might look like: \n\n" 112 + "/mydata/uima-output/annotations.txt\n" 113 + "C:\\Documents and Settings\\User\\My Documents\\workspace\\My Project\\data\\experiment\\output\\output.annotations\n"; 114 115 public static final String PARAM_OUTPUT_FILE_NAME = "outputFileName"; 116 117 @ConfigurationParameter( 118 description = OUTPUT_FILE_NAME_DESCRIPTION, 119 name = PARAM_OUTPUT_FILE_NAME, 120 mandatory = false) 121 private String outputFileName; 122 123 private static final String OUTPUT_ANNOTATION_CLASS_NAME_DESCRIPTION = "takes the name of the annotation class of the annotations that are to be " 124 + "written out. The annotation class must be a subclass of org.apache.uima.jcas.tcas.Annotation. " 125 + "The manner in which annotations are written out is determined by the AnnotationWriter as " 126 + "described below. The AnnotationWriter interface is generically typed. The class specified by this " 127 + "parameter must be the same as or a subclass of the type specified by the implementation of " 128 + "AnnotationWriter. Example values that could be provided might include:\n\n" 129 + "org.apache.uima.jcas.tcas.Annotation (default)\n" 130 + "org.cleartk.type.Token\n" 131 + "org.cleartk.type.Sentence\n" + "com.yourcompany.yourpackage.YourType"; 132 133 public static final String PARAM_OUTPUT_ANNOTATION_CLASS_NAME = "outputAnnotationClassName"; 134 135 @ConfigurationParameter( 136 name = PARAM_OUTPUT_ANNOTATION_CLASS_NAME, 137 mandatory = true, 138 description = OUTPUT_ANNOTATION_CLASS_NAME_DESCRIPTION, 139 defaultValue = "org.apache.uima.jcas.tcas.Annotation") 140 private static String outputAnnotationClassName; 141 142 private static final String ANNOTATION_WRITER_CLASS_NAME_DESCRIPTION = "provides the class name of a class that extends org.cleartk.util.linewriter.AnnotationWriter. " 143 + "The AnnotationWriter determines how annotations will be written. For example, " 144 + "CoveredTextAnnotationWriter simply writes out the covered text of an annotation. " 145 + "Example values that could be provided might include:\n\n" 146 + "org.cleartk.util.linewriter.annotation.CoveredTextAnnotationWriter (default)\n" 147 + "org.cleartk.util.linewriter.annotation.TokenPOSWriter\n"; 148 149 public static final String PARAM_ANNOTATION_WRITER_CLASS_NAME = "annotationWriterClassName"; 150 151 @ConfigurationParameter( 152 name = PARAM_ANNOTATION_WRITER_CLASS_NAME, 153 mandatory = true, 154 description = ANNOTATION_WRITER_CLASS_NAME_DESCRIPTION, 155 defaultValue = "org.cleartk.util.ae.linewriter.annotation.CoveredTextAnnotationWriter") 156 private String annotationWriterClassName; 157 158 private static final String BLOCK_ANNOTATION_CLASS_NAME_DESCRIPTION = "Takes the name of an annotation class that determines a 'block' of lines in the " 159 + "resulting output file(s). Each 'block' of lines is separated by some text " 160 + "(such as a newline) as determined by the BlockWriter specified as " 161 + "described below. If, for example, the value of 'outputAnnotationClassName' is " 162 + "'org.cleartk.type.Token' and the value for 'blockAnnotationClassName' is " 163 + "'org.cleartk.type.Sentence' and the value for 'blockWriterClassName' is " 164 + "'org.cleartk.util.linewriter.block.BlankLineBlockWriter' (the default), then the tokens in each sentence " 165 + "will be written out one per line with a blank line between the last token of a sentence and the first " 166 + "token of the following sentence. Note that setting this parameter may limit the number of annotations " 167 + "that are written out if, for example, not all tokens are found inside sentences. If no value is given, then " 168 + "there will be no blank lines in the resulting file (assuming the AnnotationWriter does not produce a " 169 + "blank line). If you want there to be a blank line between each document (assuming 'outputFileName' " 170 + " is given a parameter), then this parameter should be given the value 'org.apache.uima.jcas.tcas.DocumentAnnotation'. " 171 + "Example values that could be provided might include: \n\n" 172 + "org.cleartk.type.Sentence\n" 173 + "org.apache.uima.jcas.tcas.DocumentAnnotation\n" + "com.yourcompany.yourpackage.YourType\n"; 174 175 public static final String PARAM_BLOCK_ANNOTATION_CLASS_NAME = "blockAnnotationClassName"; 176 177 @ConfigurationParameter( 178 description = BLOCK_ANNOTATION_CLASS_NAME_DESCRIPTION, 179 name = PARAM_BLOCK_ANNOTATION_CLASS_NAME, 180 mandatory = false) 181 private String blockAnnotationClassName; 182 183 private final static String BLOCK_WRITER_CLASS_NAME_DESCRIPTION = "Provides the class name of a class that extends org.cleartk.util.linewriter.BlockWriter. " 184 + "The BlockWriter determines how blocks of annotations will be delimited. For example, " 185 + "org.cleartk.util.linewriter.block.BlankLineBlockWriter simply writes out a blank line between each " 186 + "block of annotations. Example values that could be provided might include: \n\n" 187 + "org.cleartk.util.linewriter.block.BlankLineBlockWriter\n" 188 + "org.cleartk.util.linewriter.block.DocumentIdBlockWriter\n"; 189 190 public static final String PARAM_BLOCK_WRITER_CLASS_NAME = "blockWriterClassName"; 191 192 @ConfigurationParameter( 193 name = PARAM_BLOCK_WRITER_CLASS_NAME, 194 description = BLOCK_WRITER_CLASS_NAME_DESCRIPTION, 195 mandatory = false, 196 defaultValue = "org.cleartk.util.ae.linewriter.block.BlankLineBlockWriter") 197 private String blockWriterClassName; 198 199 private File outputDirectory; 200 201 private File outputFile; 202 203 private Class<? extends Annotation> outputAnnotationClass; 204 205 private Type outputAnnotationType; 206 207 private Class<? extends Annotation> blockAnnotationClass; 208 209 private Type blockAnnotationType; 210 211 boolean blockOnDocument = false; 212 213 AnnotationWriter<ANNOTATION_TYPE> annotationWriter; 214 215 BlockWriter<BLOCK_TYPE> blockWriter; 216 217 PrintStream out; 218 219 private boolean typesInitialized = false; 220 221 @Override 222 public void initialize(UimaContext context) throws ResourceInitializationException { 223 try { 224 super.initialize(context); 225 226 if ((outputDirectoryName == null && outputFileName == null) 227 || (outputDirectoryName != null && outputFileName != null)) { 228 throw CleartkInitializationException.notExactlyOneParameterSet( 229 PARAM_OUTPUT_DIRECTORY_NAME, 230 outputDirectoryName, 231 PARAM_OUTPUT_FILE_NAME, 232 outputFileName); 233 } 234 235 if (outputDirectoryName != null) { 236 outputDirectory = new File(outputDirectoryName); 237 if (!this.outputDirectory.exists()) { 238 this.outputDirectory.mkdirs(); 239 } 240 } 241 242 if (outputFileName != null) { 243 outputFile = new File(outputFileName); 244 if (!outputFile.getParentFile().exists()) { 245 outputFile.getParentFile().mkdirs(); 246 } 247 out = new PrintStream(outputFile); 248 } 249 250 outputAnnotationClass = InitializableFactory.getClass( 251 outputAnnotationClassName, 252 Annotation.class); 253 254 Class<? extends AnnotationWriter<ANNOTATION_TYPE>> annotationWriterClass = ReflectionUtil.uncheckedCast(Class.forName( 255 annotationWriterClassName).asSubclass(AnnotationWriter.class)); 256 annotationWriter = InitializableFactory.create( 257 context, 258 annotationWriterClassName, 259 annotationWriterClass); 260 261 java.lang.reflect.Type annotationType = ReflectionUtil.getTypeArgument( 262 AnnotationWriter.class, 263 "ANNOTATION_TYPE", 264 this.annotationWriter); 265 266 if (!ReflectionUtil.isAssignableFrom(annotationType, outputAnnotationClass)) { 267 throw CleartkInitializationException.incompatibleTypeParameterAndType( 268 this.annotationWriter, 269 "ANNOTATION_TYPE", 270 annotationType, 271 outputAnnotationClass); 272 } 273 274 if (blockAnnotationClassName != null) { 275 276 Class<? extends BlockWriter<BLOCK_TYPE>> blockWriterClass = ReflectionUtil.uncheckedCast(Class.forName( 277 blockWriterClassName).asSubclass(BlockWriter.class)); 278 this.blockWriter = InitializableFactory.create( 279 context, 280 blockWriterClassName, 281 blockWriterClass); 282 283 if (blockAnnotationClassName.equals("org.apache.uima.jcas.tcas.DocumentAnnotation")) { 284 blockOnDocument = true; 285 } else { 286 blockAnnotationClass = Class.forName(blockAnnotationClassName).asSubclass( 287 Annotation.class); 288 289 java.lang.reflect.Type blockType = ReflectionUtil.getTypeArgument( 290 BlockWriter.class, 291 "BLOCK_TYPE", 292 this.blockWriter); 293 294 if (!ReflectionUtil.isAssignableFrom(blockType, blockAnnotationClass)) { 295 throw CleartkInitializationException.incompatibleTypeParameterAndType( 296 this.blockWriter, 297 "BLOCK_TYPE", 298 blockType, 299 blockAnnotationClass); 300 } 301 } 302 } 303 304 if (fileSuffix == null) { 305 fileSuffix = ""; 306 } else if (!fileSuffix.startsWith(".")) { 307 fileSuffix = "." + fileSuffix; 308 } 309 } catch (Exception e) { 310 throw new ResourceInitializationException(e); 311 } 312 313 } 314 315 private void initializeTypes(JCas jCas) throws AnalysisEngineProcessException { 316 try { 317 outputAnnotationType = JCasUtil.getType(jCas, outputAnnotationClass); 318 if (blockAnnotationClass != null) { 319 blockAnnotationType = JCasUtil.getType(jCas, blockAnnotationClass); 320 } 321 } catch (Exception e) { 322 throw new AnalysisEngineProcessException(e); 323 } 324 typesInitialized = true; 325 } 326 327 @SuppressWarnings("unchecked") 328 @Override 329 public void process(JCas jCas) throws AnalysisEngineProcessException { 330 if (!typesInitialized) 331 initializeTypes(jCas); 332 333 try { 334 if (outputDirectory != null) { 335 String id = (new File(ViewUriUtil.getURI(jCas))).getName(); 336 while (id.endsWith(".")) { 337 id = id.substring(0, id.length() - 1); 338 } 339 out = new PrintStream(new File(outputDirectory, id + fileSuffix)); 340 } 341 342 if (blockOnDocument) { 343 BLOCK_TYPE documentAnnotation = (BLOCK_TYPE) jCas.getDocumentAnnotationFs(); 344 out.print(blockWriter.writeBlock(jCas, documentAnnotation)); 345 FSIterator<Annotation> outputAnnotations = jCas.getAnnotationIndex(outputAnnotationType).iterator(); 346 while (outputAnnotations.hasNext()) { 347 ANNOTATION_TYPE outputAnnotation = (ANNOTATION_TYPE) outputAnnotations.next(); 348 out.println(annotationWriter.writeAnnotation(jCas, outputAnnotation)); 349 } 350 } else if (blockAnnotationType != null) { 351 for (Annotation block : JCasUtil.select(jCas, blockAnnotationClass)) { 352 BLOCK_TYPE blockAnnotation = (BLOCK_TYPE) block; 353 out.print(blockWriter.writeBlock(jCas, blockAnnotation)); 354 for (Annotation output : JCasUtil.selectCovered(outputAnnotationClass, blockAnnotation)) { 355 ANNOTATION_TYPE outputAnnotation = (ANNOTATION_TYPE) output; 356 out.println(annotationWriter.writeAnnotation(jCas, outputAnnotation)); 357 } 358 } 359 } 360 361 else { 362 FSIterator<Annotation> outputAnnotations = jCas.getAnnotationIndex(outputAnnotationType).iterator(); 363 while (outputAnnotations.hasNext()) { 364 ANNOTATION_TYPE outputAnnotation = (ANNOTATION_TYPE) outputAnnotations.next(); 365 out.println(annotationWriter.writeAnnotation(jCas, outputAnnotation)); 366 } 367 } 368 369 if (outputDirectory != null) { 370 out.flush(); 371 out.close(); 372 } 373 } catch (FileNotFoundException fnfe) { 374 throw new AnalysisEngineProcessException(fnfe); 375 } 376 } 377 378 @Override 379 public void collectionProcessComplete() throws AnalysisEngineProcessException { 380 if (outputFile != null) { 381 out.flush(); 382 out.close(); 383 } 384 // TODO Auto-generated method stub 385 super.collectionProcessComplete(); 386 } 387 388}