001/** 002 * Copyright (c) 2007-2008, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.corpus.timeml; 025 026import java.io.ByteArrayOutputStream; 027import java.io.File; 028import java.io.FileWriter; 029import java.io.IOException; 030import java.util.ArrayList; 031import java.util.List; 032 033import org.apache.uima.UimaContext; 034import org.apache.uima.analysis_engine.AnalysisEngineDescription; 035import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 036import org.apache.uima.cas.CAS; 037import org.apache.uima.cas.FSIterator; 038import org.apache.uima.cas.text.AnnotationFS; 039import org.apache.uima.jcas.JCas; 040import org.apache.uima.resource.ResourceInitializationException; 041import org.apache.uima.util.XMLSerializer; 042import org.cleartk.util.ViewUriUtil; 043import org.apache.uima.fit.component.JCasAnnotator_ImplBase; 044import org.apache.uima.fit.descriptor.ConfigurationParameter; 045import org.apache.uima.fit.factory.AnalysisEngineFactory; 046import org.xml.sax.ContentHandler; 047import org.xml.sax.SAXException; 048import org.xml.sax.helpers.AttributesImpl; 049 050/** 051 * <br> 052 * Copyright (c) 2007-2008, Regents of the University of Colorado <br> 053 * All rights reserved. 054 * 055 * 056 * @author Steven Bethard 057 * 058 */ 059public class TempEval2007Writer extends JCasAnnotator_ImplBase { 060 061 public static final String PARAM_OUTPUT_DIRECTORY_NAME = "outputDirectoryName"; 062 063 @ConfigurationParameter( 064 name = PARAM_OUTPUT_DIRECTORY_NAME, 065 description = "Provides the path where the TimeML documents should be written.", 066 mandatory = true) 067 private String outputDirectoryName; 068 069 public static AnalysisEngineDescription getDescription(String outputDir) 070 throws ResourceInitializationException { 071 return AnalysisEngineFactory.createEngineDescription( 072 TempEval2007Writer.class, 073 PARAM_OUTPUT_DIRECTORY_NAME, 074 outputDir); 075 } 076 077 private File outputDirectory; 078 079 @Override 080 public void initialize(UimaContext context) throws ResourceInitializationException { 081 super.initialize(context); 082 083 this.outputDirectory = new File(outputDirectoryName); 084 if (!this.outputDirectory.exists()) { 085 this.outputDirectory.mkdirs(); 086 } 087 } 088 089 public static String toTimeML(JCas jCas) throws AnalysisEngineProcessException { 090 try { 091 return toXML(jCas.getCas(), new TimeMlAnnotationsToElements()); 092 } catch (SAXException e) { 093 throw new AnalysisEngineProcessException(e); 094 } 095 } 096 097 @Override 098 public void process(JCas jCas) throws AnalysisEngineProcessException { 099 String xmlString = toTimeML(jCas); 100 101 // write the TimeML to the output file 102 String filePath = ViewUriUtil.getURI(jCas).getPath(); 103 String fileName = new File(filePath).getName(); 104 if (!fileName.endsWith(".tml")) { 105 fileName += ".tml"; 106 } 107 File outputFile = new File(this.outputDirectory, fileName); 108 try { 109 FileWriter writer = new FileWriter(outputFile); 110 writer.write(xmlString); 111 writer.close(); 112 } catch (IOException e) { 113 throw new AnalysisEngineProcessException(e); 114 } 115 } 116 117 public void setOutputDirectoryName(String outputDirectoryName) { 118 this.outputDirectoryName = outputDirectoryName; 119 } 120 121 private static interface AnnotationsToElements { 122 public void startRootElement(ContentHandler handler) throws SAXException; 123 124 public void endRootElement(ContentHandler handler) throws SAXException; 125 126 public void startAnnotationElement(AnnotationFS annotation, ContentHandler handler) 127 throws SAXException; 128 129 public void endAnnotationElement(AnnotationFS annotation, ContentHandler handler) 130 throws SAXException; 131 } 132 133 private static class TimeMlAnnotationsToElements implements AnnotationsToElements { 134 135 public TimeMlAnnotationsToElements() { 136 } 137 138 @Override 139 public void startRootElement(ContentHandler handler) throws SAXException { 140 handler.startElement("", "TimeML", "TimeML", new AttributesImpl()); 141 } 142 143 @Override 144 public void endRootElement(ContentHandler handler) throws SAXException { 145 handler.endElement("", "TimeML", "TimeML"); 146 } 147 148 @Override 149 public void startAnnotationElement(AnnotationFS annotation, ContentHandler handler) 150 throws SAXException { 151 String name = TimeMlUtil.toTimeMLElementName(annotation); 152 if (name != null) { 153 handler.startElement("", name, name, TimeMlUtil.toTempEval2007Attributes(annotation, name)); 154 } 155 } 156 157 @Override 158 public void endAnnotationElement(AnnotationFS annotation, ContentHandler handler) 159 throws SAXException { 160 String name = TimeMlUtil.toTimeMLElementName(annotation); 161 if (name != null) { 162 handler.endElement("", name, name); 163 } 164 } 165 } 166 167 /** 168 * Copied and modified from {@link org.apache.uima.util.CasToInlineXml} 169 */ 170 private static String toXML(CAS cas, AnnotationsToElements converter) throws SAXException { 171 ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); 172 XMLSerializer sax2xml = new XMLSerializer(byteArrayOutputStream, false); 173 174 // get document text 175 String docText = cas.getDocumentText(); 176 char[] docCharArray = docText.toCharArray(); 177 178 // get iterator over annotations sorted by increasing start position and 179 // decreasing end position 180 FSIterator<AnnotationFS> iterator = cas.getAnnotationIndex().iterator(); 181 182 // This is basically a recursive algorithm that has had the recursion 183 // removed through the use of an explicit Stack. We iterate over the 184 // annotations, and if an annotation contains other annotations, we 185 // push the parent annotation on the stack, process the children, and 186 // then come back to the parent later. 187 List<AnnotationFS> stack = new ArrayList<AnnotationFS>(); 188 int pos = 0; 189 190 ContentHandler handler = sax2xml.getContentHandler(); 191 handler.startDocument(); 192 // write the start tag 193 converter.startRootElement(handler); 194 // now use null is a placeholder for this artificial Document annotation 195 AnnotationFS curAnnot = null; 196 197 while (iterator.isValid()) { 198 AnnotationFS nextAnnot = iterator.get(); 199 200 if (curAnnot == null || nextAnnot.getBegin() < curAnnot.getEnd()) { 201 // nextAnnot's start point is within the span of curAnnot 202 if (curAnnot == null || nextAnnot.getEnd() <= curAnnot.getEnd()) // crossover span check 203 { 204 // nextAnnot is contained within curAnnot 205 206 // write text between current pos and beginning of nextAnnot 207 try { 208 handler.characters(docCharArray, pos, nextAnnot.getBegin() - pos); 209 pos = nextAnnot.getBegin(); 210 converter.startAnnotationElement(nextAnnot, handler); 211 212 // push parent annotation on stack 213 stack.add(curAnnot); 214 // move on to next annotation 215 curAnnot = nextAnnot; 216 } catch (StringIndexOutOfBoundsException e) { 217 System.err.println("Invalid annotation range: " + nextAnnot.getBegin() + "," 218 + nextAnnot.getEnd() + " in document of length " + docText.length()); 219 } 220 } 221 iterator.moveToNext(); 222 } else { 223 // nextAnnot begins after curAnnot ends 224 // write text between current pos and end of curAnnot 225 try { 226 handler.characters(docCharArray, pos, curAnnot.getEnd() - pos); 227 pos = curAnnot.getEnd(); 228 } catch (StringIndexOutOfBoundsException e) { 229 System.err.println("Invalid annotation range: " + curAnnot.getBegin() + "," 230 + curAnnot.getEnd() + " in document of length " + docText.length()); 231 } 232 converter.endAnnotationElement(curAnnot, handler); 233 234 // pop next containing annotation off stack 235 curAnnot = stack.remove(stack.size() - 1); 236 } 237 } 238 239 // finished writing all start tags, now finish up 240 if (curAnnot != null) { 241 try { 242 handler.characters(docCharArray, pos, curAnnot.getEnd() - pos); 243 pos = curAnnot.getEnd(); 244 } catch (StringIndexOutOfBoundsException e) { 245 System.err.println("Invalid annotation range: " + curAnnot.getBegin() + "," 246 + curAnnot.getEnd() + "in document of length " + docText.length()); 247 } 248 converter.endAnnotationElement(curAnnot, handler); 249 250 while (!stack.isEmpty()) { 251 curAnnot = stack.remove(stack.size() - 1); // pop 252 if (curAnnot == null) { 253 break; 254 } 255 try { 256 handler.characters(docCharArray, pos, curAnnot.getEnd() - pos); 257 pos = curAnnot.getEnd(); 258 } catch (StringIndexOutOfBoundsException e) { 259 System.err.println("Invalid annotation range: " + curAnnot.getBegin() + "," 260 + curAnnot.getEnd() + "in document of length " + docText.length()); 261 } 262 converter.endAnnotationElement(curAnnot, handler); 263 } 264 } 265 266 if (pos < docCharArray.length) { 267 handler.characters(docCharArray, pos, docCharArray.length - pos); 268 } 269 converter.endRootElement(handler); 270 handler.endDocument(); 271 272 // return XML string 273 return new String(byteArrayOutputStream.toByteArray()); 274 } 275}