001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.corpus.timeml;
025
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.XMLSerializer;
import org.cleartk.util.ViewUriUtil;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
049
050/**
051 * <br>
052 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
053 * All rights reserved.
054 * 
055 * 
056 * @author Steven Bethard
057 * 
058 */
059public class TempEval2007Writer extends JCasAnnotator_ImplBase {
060
061  public static final String PARAM_OUTPUT_DIRECTORY_NAME = "outputDirectoryName";
062
063  @ConfigurationParameter(
064      name = PARAM_OUTPUT_DIRECTORY_NAME,
065      description = "Provides the path where the TimeML documents should be written.",
066      mandatory = true)
067  private String outputDirectoryName;
068
069  public static AnalysisEngineDescription getDescription(String outputDir)
070      throws ResourceInitializationException {
071    return AnalysisEngineFactory.createEngineDescription(
072        TempEval2007Writer.class,
073        PARAM_OUTPUT_DIRECTORY_NAME,
074        outputDir);
075  }
076
077  private File outputDirectory;
078
079  @Override
080  public void initialize(UimaContext context) throws ResourceInitializationException {
081    super.initialize(context);
082
083    this.outputDirectory = new File(outputDirectoryName);
084    if (!this.outputDirectory.exists()) {
085      this.outputDirectory.mkdirs();
086    }
087  }
088
089  public static String toTimeML(JCas jCas) throws AnalysisEngineProcessException {
090    try {
091      return toXML(jCas.getCas(), new TimeMlAnnotationsToElements());
092    } catch (SAXException e) {
093      throw new AnalysisEngineProcessException(e);
094    }
095  }
096
097  @Override
098  public void process(JCas jCas) throws AnalysisEngineProcessException {
099    String xmlString = toTimeML(jCas);
100
101    // write the TimeML to the output file
102    String filePath = ViewUriUtil.getURI(jCas).getPath();
103    String fileName = new File(filePath).getName();
104    if (!fileName.endsWith(".tml")) {
105      fileName += ".tml";
106    }
107    File outputFile = new File(this.outputDirectory, fileName);
108    try {
109      FileWriter writer = new FileWriter(outputFile);
110      writer.write(xmlString);
111      writer.close();
112    } catch (IOException e) {
113      throw new AnalysisEngineProcessException(e);
114    }
115  }
116
117  public void setOutputDirectoryName(String outputDirectoryName) {
118    this.outputDirectoryName = outputDirectoryName;
119  }
120
121  private static interface AnnotationsToElements {
122    public void startRootElement(ContentHandler handler) throws SAXException;
123
124    public void endRootElement(ContentHandler handler) throws SAXException;
125
126    public void startAnnotationElement(AnnotationFS annotation, ContentHandler handler)
127        throws SAXException;
128
129    public void endAnnotationElement(AnnotationFS annotation, ContentHandler handler)
130        throws SAXException;
131  }
132
133  private static class TimeMlAnnotationsToElements implements AnnotationsToElements {
134
135    public TimeMlAnnotationsToElements() {
136    }
137
138    @Override
139    public void startRootElement(ContentHandler handler) throws SAXException {
140      handler.startElement("", "TimeML", "TimeML", new AttributesImpl());
141    }
142
143    @Override
144    public void endRootElement(ContentHandler handler) throws SAXException {
145      handler.endElement("", "TimeML", "TimeML");
146    }
147
148    @Override
149    public void startAnnotationElement(AnnotationFS annotation, ContentHandler handler)
150        throws SAXException {
151      String name = TimeMlUtil.toTimeMLElementName(annotation);
152      if (name != null) {
153        handler.startElement("", name, name, TimeMlUtil.toTempEval2007Attributes(annotation, name));
154      }
155    }
156
157    @Override
158    public void endAnnotationElement(AnnotationFS annotation, ContentHandler handler)
159        throws SAXException {
160      String name = TimeMlUtil.toTimeMLElementName(annotation);
161      if (name != null) {
162        handler.endElement("", name, name);
163      }
164    }
165  }
166
167  /**
168   * Copied and modified from {@link org.apache.uima.util.CasToInlineXml}
169   */
170  private static String toXML(CAS cas, AnnotationsToElements converter) throws SAXException {
171    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
172    XMLSerializer sax2xml = new XMLSerializer(byteArrayOutputStream, false);
173
174    // get document text
175    String docText = cas.getDocumentText();
176    char[] docCharArray = docText.toCharArray();
177
178    // get iterator over annotations sorted by increasing start position and
179    // decreasing end position
180    FSIterator<AnnotationFS> iterator = cas.getAnnotationIndex().iterator();
181
182    // This is basically a recursive algorithm that has had the recursion
183    // removed through the use of an explicit Stack. We iterate over the
184    // annotations, and if an annotation contains other annotations, we
185    // push the parent annotation on the stack, process the children, and
186    // then come back to the parent later.
187    List<AnnotationFS> stack = new ArrayList<AnnotationFS>();
188    int pos = 0;
189
190    ContentHandler handler = sax2xml.getContentHandler();
191    handler.startDocument();
192    // write the start tag
193    converter.startRootElement(handler);
194    // now use null is a placeholder for this artificial Document annotation
195    AnnotationFS curAnnot = null;
196
197    while (iterator.isValid()) {
198      AnnotationFS nextAnnot = iterator.get();
199
200      if (curAnnot == null || nextAnnot.getBegin() < curAnnot.getEnd()) {
201        // nextAnnot's start point is within the span of curAnnot
202        if (curAnnot == null || nextAnnot.getEnd() <= curAnnot.getEnd()) // crossover span check
203        {
204          // nextAnnot is contained within curAnnot
205
206          // write text between current pos and beginning of nextAnnot
207          try {
208            handler.characters(docCharArray, pos, nextAnnot.getBegin() - pos);
209            pos = nextAnnot.getBegin();
210            converter.startAnnotationElement(nextAnnot, handler);
211
212            // push parent annotation on stack
213            stack.add(curAnnot);
214            // move on to next annotation
215            curAnnot = nextAnnot;
216          } catch (StringIndexOutOfBoundsException e) {
217            System.err.println("Invalid annotation range: " + nextAnnot.getBegin() + ","
218                + nextAnnot.getEnd() + " in document of length " + docText.length());
219          }
220        }
221        iterator.moveToNext();
222      } else {
223        // nextAnnot begins after curAnnot ends
224        // write text between current pos and end of curAnnot
225        try {
226          handler.characters(docCharArray, pos, curAnnot.getEnd() - pos);
227          pos = curAnnot.getEnd();
228        } catch (StringIndexOutOfBoundsException e) {
229          System.err.println("Invalid annotation range: " + curAnnot.getBegin() + ","
230              + curAnnot.getEnd() + " in document of length " + docText.length());
231        }
232        converter.endAnnotationElement(curAnnot, handler);
233
234        // pop next containing annotation off stack
235        curAnnot = stack.remove(stack.size() - 1);
236      }
237    }
238
239    // finished writing all start tags, now finish up
240    if (curAnnot != null) {
241      try {
242        handler.characters(docCharArray, pos, curAnnot.getEnd() - pos);
243        pos = curAnnot.getEnd();
244      } catch (StringIndexOutOfBoundsException e) {
245        System.err.println("Invalid annotation range: " + curAnnot.getBegin() + ","
246            + curAnnot.getEnd() + "in document of length " + docText.length());
247      }
248      converter.endAnnotationElement(curAnnot, handler);
249
250      while (!stack.isEmpty()) {
251        curAnnot = stack.remove(stack.size() - 1); // pop
252        if (curAnnot == null) {
253          break;
254        }
255        try {
256          handler.characters(docCharArray, pos, curAnnot.getEnd() - pos);
257          pos = curAnnot.getEnd();
258        } catch (StringIndexOutOfBoundsException e) {
259          System.err.println("Invalid annotation range: " + curAnnot.getBegin() + ","
260              + curAnnot.getEnd() + "in document of length " + docText.length());
261        }
262        converter.endAnnotationElement(curAnnot, handler);
263      }
264    }
265
266    if (pos < docCharArray.length) {
267      handler.characters(docCharArray, pos, docCharArray.length - pos);
268    }
269    converter.endRootElement(handler);
270    handler.endDocument();
271
272    // return XML string
273    return new String(byteArrayOutputStream.toByteArray());
274  }
275}