001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024
025package org.cleartk.util.ae.linewriter;
026
027import java.io.File;
028import java.io.FileNotFoundException;
029import java.io.PrintStream;
030
031import org.apache.uima.UimaContext;
032import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
033import org.apache.uima.cas.FSIterator;
034import org.apache.uima.cas.Type;
035import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
036import org.apache.uima.fit.descriptor.ConfigurationParameter;
037import org.apache.uima.fit.factory.initializable.InitializableFactory;
038import org.apache.uima.fit.util.JCasUtil;
039import org.apache.uima.jcas.JCas;
040import org.apache.uima.jcas.tcas.Annotation;
041import org.apache.uima.resource.ResourceInitializationException;
042import org.cleartk.util.CleartkInitializationException;
043import org.cleartk.util.ReflectionUtil;
044import org.cleartk.util.ViewUriUtil;
045
046/**
047 * <br>
048 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
049 * All rights reserved.
050 * 
051 * <p>
052 * 
053 * This writer provides a way to write out annotations one-per-line to a plain text file in a
054 * variety of ways configurable at run time.
055 * 
056 * <p>
057 * This class has no relation to LineReader - i.e. LineWriter does not provide
058 * "reverse functionality" of LineReader.
059 * 
060 * <p>
061 * If you mistook this class for a line rider, then please redirect to the completely unrelated, but
062 * totally awesome Line Rider at: http://linerider.com
063 * 
064 * @author Philip Ogren
065 */
066
067public class LineWriter<ANNOTATION_TYPE extends Annotation, BLOCK_TYPE extends Annotation> extends
068    JCasAnnotator_ImplBase {
069
070  private static final String OUTPUT_DIRECTORY_NAME_DESCRIPTION = "takes a "
071      + "path to directory into which output files will be written. If no value is "
072      + "given for this parameter, then the parameter 'outputFileName' "
073      + "is required. If a value is given, then one file for each document/JCas will be created in "
074      + "the output directory provided.   The name of each file will be given by the value returned by ViewURIUtil.getURI(jCas). "
075      + "If a value for both 'outputDirectoryName' and ' "
076      + "outputFileName'  is given, then an exception will be thrown. Example values "
077      + "that could be provided might look like: \n\n"
078      + "\t/mydata/uima-output/\n"
079      + "\tC:/Documents and Settings/User/My Documents/workspace/My Project/data/experiment/output\n";
080
081  public static final String PARAM_OUTPUT_DIRECTORY_NAME = "outputDirectoryName";
082
083  @ConfigurationParameter(
084      description = OUTPUT_DIRECTORY_NAME_DESCRIPTION,
085      name = PARAM_OUTPUT_DIRECTORY_NAME,
086      mandatory = false)
087  private String outputDirectoryName;
088
089  private static final String FILE_SUFFIX_DESCRIPTION = "provides a file "
090      + "name suffix for each file generated by this writer.  If there is no value "
091      + "given for the parameter 'outputDirectoryName', then this parameter is "
092      + "ignored. If 'outputDirectoryName' is given a value, then the generated files "
093      + "will be named by the document ids and the suffix provided by this  parameter. If no value for this "
094      + "parameter is given, then the files will be named the same as the document id. Example values "
095      + "that could be provided might include: \n\n" + ".txt\n" + ".tokens\n" + ".annotations.txt";
096
097  public static final String PARAM_FILE_SUFFIX = "fileSuffix";
098
099  @ConfigurationParameter(
100      description = FILE_SUFFIX_DESCRIPTION,
101      name = PARAM_FILE_SUFFIX,
102      mandatory = false)
103  private String fileSuffix;
104
105  private static final String OUTPUT_FILE_NAME_DESCRIPTION = "takes a file "
106      + "name to write results to.  If no value is given for this parameter, then "
107      + "the parameter 'outputDirectoryName' is required.  "
108      + "If a value is given, then one file for all documents will be created in the "
109      + "output directory provided. If a value for both 'outputDirectoryName'"
110      + " and 'outputFileName' is given, then an exception will be thrown. "
111      + "Example values that could be provided might look like: \n\n"
112      + "/mydata/uima-output/annotations.txt\n"
113      + "C:\\Documents and Settings\\User\\My Documents\\workspace\\My Project\\data\\experiment\\output\\output.annotations\n";
114
115  public static final String PARAM_OUTPUT_FILE_NAME = "outputFileName";
116
117  @ConfigurationParameter(
118      description = OUTPUT_FILE_NAME_DESCRIPTION,
119      name = PARAM_OUTPUT_FILE_NAME,
120      mandatory = false)
121  private String outputFileName;
122
123  private static final String OUTPUT_ANNOTATION_CLASS_NAME_DESCRIPTION = "takes the name of the annotation class of the annotations that are to be "
124      + "written out. The annotation class must be a subclass of org.apache.uima.jcas.tcas.Annotation. "
125      + "The manner in which annotations are written out is determined by the AnnotationWriter as "
126      + "described below. The AnnotationWriter interface is generically typed. The class specified by this "
127      + "parameter must be the same as or a subclass of the type specified by the implementation of "
128      + "AnnotationWriter. Example values that could be provided might include:\n\n"
129      + "org.apache.uima.jcas.tcas.Annotation (default)\n"
130      + "org.cleartk.type.Token\n"
131      + "org.cleartk.type.Sentence\n" + "com.yourcompany.yourpackage.YourType";
132
133  public static final String PARAM_OUTPUT_ANNOTATION_CLASS_NAME = "outputAnnotationClassName";
134
135  @ConfigurationParameter(
136      name = PARAM_OUTPUT_ANNOTATION_CLASS_NAME,
137      mandatory = true,
138      description = OUTPUT_ANNOTATION_CLASS_NAME_DESCRIPTION,
139      defaultValue = "org.apache.uima.jcas.tcas.Annotation")
140  private static String outputAnnotationClassName;
141
142  private static final String ANNOTATION_WRITER_CLASS_NAME_DESCRIPTION = "provides the class name of a class that extends org.cleartk.util.linewriter.AnnotationWriter. "
143      + "The AnnotationWriter determines how annotations will be written. For example, "
144      + "CoveredTextAnnotationWriter simply writes out the covered text of an annotation. "
145      + "Example values that could be provided might include:\n\n"
146      + "org.cleartk.util.linewriter.annotation.CoveredTextAnnotationWriter (default)\n"
147      + "org.cleartk.util.linewriter.annotation.TokenPOSWriter\n";
148
149  public static final String PARAM_ANNOTATION_WRITER_CLASS_NAME = "annotationWriterClassName";
150
151  @ConfigurationParameter(
152      name = PARAM_ANNOTATION_WRITER_CLASS_NAME,
153      mandatory = true,
154      description = ANNOTATION_WRITER_CLASS_NAME_DESCRIPTION,
155      defaultValue = "org.cleartk.util.ae.linewriter.annotation.CoveredTextAnnotationWriter")
156  private String annotationWriterClassName;
157
158  private static final String BLOCK_ANNOTATION_CLASS_NAME_DESCRIPTION = "Takes the name of an annotation class that determines a 'block' of lines in the "
159      + "resulting output file(s). Each 'block' of lines is separated by some text "
160      + "(such as a newline) as determined by the BlockWriter specified as "
161      + "described below. If, for example, the value of 'outputAnnotationClassName' is "
162      + "'org.cleartk.type.Token' and the value for 'blockAnnotationClassName' is "
163      + "'org.cleartk.type.Sentence' and the value for 'blockWriterClassName'  is "
164      + "'org.cleartk.util.linewriter.block.BlankLineBlockWriter' (the default), then the tokens in each sentence "
165      + "will be written out one per line with a blank line between the last token of a sentence and the first "
166      + "token of the following sentence. Note that setting this parameter may limit the number of annotations "
167      + "that are written out if, for example, not all tokens are found inside sentences.  If no value is given, then "
168      + "there will be no blank lines in the resulting file (assuming the AnnotationWriter does not produce a "
169      + "blank line). If you want there to be a blank line between each document (assuming 'outputFileName' "
170      + " is given a parameter), then this parameter should be given the value 'org.apache.uima.jcas.tcas.DocumentAnnotation'. "
171      + "Example values that could be provided might include: \n\n"
172      + "org.cleartk.type.Sentence\n"
173      + "org.apache.uima.jcas.tcas.DocumentAnnotation\n" + "com.yourcompany.yourpackage.YourType\n";
174
175  public static final String PARAM_BLOCK_ANNOTATION_CLASS_NAME = "blockAnnotationClassName";
176
177  @ConfigurationParameter(
178      description = BLOCK_ANNOTATION_CLASS_NAME_DESCRIPTION,
179      name = PARAM_BLOCK_ANNOTATION_CLASS_NAME,
180      mandatory = false)
181  private String blockAnnotationClassName;
182
183  private final static String BLOCK_WRITER_CLASS_NAME_DESCRIPTION = "Provides  the class name of a class that extends org.cleartk.util.linewriter.BlockWriter. "
184      + "The BlockWriter determines how blocks of annotations will be delimited. For example, "
185      + "org.cleartk.util.linewriter.block.BlankLineBlockWriter simply writes out a blank line between each "
186      + "block of annotations.  Example values that could be provided might include: \n\n"
187      + "org.cleartk.util.linewriter.block.BlankLineBlockWriter\n"
188      + "org.cleartk.util.linewriter.block.DocumentIdBlockWriter\n";
189
190  public static final String PARAM_BLOCK_WRITER_CLASS_NAME = "blockWriterClassName";
191
192  @ConfigurationParameter(
193      name = PARAM_BLOCK_WRITER_CLASS_NAME,
194      description = BLOCK_WRITER_CLASS_NAME_DESCRIPTION,
195      mandatory = false,
196      defaultValue = "org.cleartk.util.ae.linewriter.block.BlankLineBlockWriter")
197  private String blockWriterClassName;
198
199  private File outputDirectory;
200
201  private File outputFile;
202
203  private Class<? extends Annotation> outputAnnotationClass;
204
205  private Type outputAnnotationType;
206
207  private Class<? extends Annotation> blockAnnotationClass;
208
209  private Type blockAnnotationType;
210
211  boolean blockOnDocument = false;
212
213  AnnotationWriter<ANNOTATION_TYPE> annotationWriter;
214
215  BlockWriter<BLOCK_TYPE> blockWriter;
216
217  PrintStream out;
218
219  private boolean typesInitialized = false;
220
221  @Override
222  public void initialize(UimaContext context) throws ResourceInitializationException {
223    try {
224      super.initialize(context);
225
226      if ((outputDirectoryName == null && outputFileName == null)
227          || (outputDirectoryName != null && outputFileName != null)) {
228        throw CleartkInitializationException.notExactlyOneParameterSet(
229            PARAM_OUTPUT_DIRECTORY_NAME,
230            outputDirectoryName,
231            PARAM_OUTPUT_FILE_NAME,
232            outputFileName);
233      }
234
235      if (outputDirectoryName != null) {
236        outputDirectory = new File(outputDirectoryName);
237        if (!this.outputDirectory.exists()) {
238          this.outputDirectory.mkdirs();
239        }
240      }
241
242      if (outputFileName != null) {
243        outputFile = new File(outputFileName);
244        if (!outputFile.getParentFile().exists()) {
245          outputFile.getParentFile().mkdirs();
246        }
247        out = new PrintStream(outputFile);
248      }
249
250      outputAnnotationClass = InitializableFactory.getClass(
251          outputAnnotationClassName,
252          Annotation.class);
253
254      Class<? extends AnnotationWriter<ANNOTATION_TYPE>> annotationWriterClass = ReflectionUtil.uncheckedCast(Class.forName(
255          annotationWriterClassName).asSubclass(AnnotationWriter.class));
256      annotationWriter = InitializableFactory.create(
257          context,
258          annotationWriterClassName,
259          annotationWriterClass);
260
261      java.lang.reflect.Type annotationType = ReflectionUtil.getTypeArgument(
262          AnnotationWriter.class,
263          "ANNOTATION_TYPE",
264          this.annotationWriter);
265
266      if (!ReflectionUtil.isAssignableFrom(annotationType, outputAnnotationClass)) {
267        throw CleartkInitializationException.incompatibleTypeParameterAndType(
268            this.annotationWriter,
269            "ANNOTATION_TYPE",
270            annotationType,
271            outputAnnotationClass);
272      }
273
274      if (blockAnnotationClassName != null) {
275
276        Class<? extends BlockWriter<BLOCK_TYPE>> blockWriterClass = ReflectionUtil.uncheckedCast(Class.forName(
277            blockWriterClassName).asSubclass(BlockWriter.class));
278        this.blockWriter = InitializableFactory.create(
279            context,
280            blockWriterClassName,
281            blockWriterClass);
282
283        if (blockAnnotationClassName.equals("org.apache.uima.jcas.tcas.DocumentAnnotation")) {
284          blockOnDocument = true;
285        } else {
286          blockAnnotationClass = Class.forName(blockAnnotationClassName).asSubclass(
287              Annotation.class);
288
289          java.lang.reflect.Type blockType = ReflectionUtil.getTypeArgument(
290              BlockWriter.class,
291              "BLOCK_TYPE",
292              this.blockWriter);
293
294          if (!ReflectionUtil.isAssignableFrom(blockType, blockAnnotationClass)) {
295            throw CleartkInitializationException.incompatibleTypeParameterAndType(
296                this.blockWriter,
297                "BLOCK_TYPE",
298                blockType,
299                blockAnnotationClass);
300          }
301        }
302      }
303
304      if (fileSuffix == null) {
305        fileSuffix = "";
306      } else if (!fileSuffix.startsWith(".")) {
307        fileSuffix = "." + fileSuffix;
308      }
309    } catch (Exception e) {
310      throw new ResourceInitializationException(e);
311    }
312
313  }
314
315  private void initializeTypes(JCas jCas) throws AnalysisEngineProcessException {
316    try {
317      outputAnnotationType = JCasUtil.getType(jCas, outputAnnotationClass);
318      if (blockAnnotationClass != null) {
319        blockAnnotationType = JCasUtil.getType(jCas, blockAnnotationClass);
320      }
321    } catch (Exception e) {
322      throw new AnalysisEngineProcessException(e);
323    }
324    typesInitialized = true;
325  }
326
327  @SuppressWarnings("unchecked")
328  @Override
329  public void process(JCas jCas) throws AnalysisEngineProcessException {
330    if (!typesInitialized)
331      initializeTypes(jCas);
332
333    try {
334      if (outputDirectory != null) {
335        String id = (new File(ViewUriUtil.getURI(jCas))).getName();
336        while (id.endsWith(".")) {
337          id = id.substring(0, id.length() - 1);
338        }
339        out = new PrintStream(new File(outputDirectory, id + fileSuffix));
340      }
341
342      if (blockOnDocument) {
343        BLOCK_TYPE documentAnnotation = (BLOCK_TYPE) jCas.getDocumentAnnotationFs();
344        out.print(blockWriter.writeBlock(jCas, documentAnnotation));
345        FSIterator<Annotation> outputAnnotations = jCas.getAnnotationIndex(outputAnnotationType).iterator();
346        while (outputAnnotations.hasNext()) {
347          ANNOTATION_TYPE outputAnnotation = (ANNOTATION_TYPE) outputAnnotations.next();
348          out.println(annotationWriter.writeAnnotation(jCas, outputAnnotation));
349        }
350      } else if (blockAnnotationType != null) {
351        for (Annotation block : JCasUtil.select(jCas, blockAnnotationClass)) {
352          BLOCK_TYPE blockAnnotation = (BLOCK_TYPE) block;
353          out.print(blockWriter.writeBlock(jCas, blockAnnotation));
354          for (Annotation output : JCasUtil.selectCovered(outputAnnotationClass, blockAnnotation)) {
355            ANNOTATION_TYPE outputAnnotation = (ANNOTATION_TYPE) output;
356            out.println(annotationWriter.writeAnnotation(jCas, outputAnnotation));
357          }
358        }
359      }
360
361      else {
362        FSIterator<Annotation> outputAnnotations = jCas.getAnnotationIndex(outputAnnotationType).iterator();
363        while (outputAnnotations.hasNext()) {
364          ANNOTATION_TYPE outputAnnotation = (ANNOTATION_TYPE) outputAnnotations.next();
365          out.println(annotationWriter.writeAnnotation(jCas, outputAnnotation));
366        }
367      }
368
369      if (outputDirectory != null) {
370        out.flush();
371        out.close();
372      }
373    } catch (FileNotFoundException fnfe) {
374      throw new AnalysisEngineProcessException(fnfe);
375    }
376  }
377
378  @Override
379  public void collectionProcessComplete() throws AnalysisEngineProcessException {
380    if (outputFile != null) {
381      out.flush();
382      out.close();
383    }
384    // TODO Auto-generated method stub
385    super.collectionProcessComplete();
386  }
387
388}