/*
 * Copyright (c) 2011, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.timeml.eval;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.pipeline.JCasIterator;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.jcas.JCas;
import org.cleartk.corpus.timeml.TempEval2010CollectionReader;
import org.cleartk.corpus.timeml.TempEval2010GoldAnnotator;
import org.cleartk.corpus.timeml.TempEval2010Writer;
import org.cleartk.eval.AnnotationStatistics;
import org.cleartk.eval.Evaluation_ImplBase;
import org.cleartk.ml.jar.JarClassifierBuilder;
import org.cleartk.util.ViewUriUtil;
import org.cleartk.util.ae.XmiWriter;
import org.xml.sax.SAXException;

/**
 * Evaluation over the TempEval 2010 data that trains one classifier per {@link ModelInfo} and
 * reports one {@link AnnotationStatistics} per model.
 * <p>
 * Gold annotations and preprocessing are run once and cached as XMI files below the evaluation
 * directory (see {@link #getXMIDirectory(File, Stage)}); training and testing then read those
 * XMI files back through {@link XMIReader}.
 *
 * <br>
 * Copyright (c) 2011, Regents of the University of Colorado <br>
 * All rights reserved.
 *
 * @author Steven Bethard
 */
public class TempEval2010Evaluation extends
    Evaluation_ImplBase<String, Map<ModelInfo<?>, AnnotationStatistics<String>>> {

  /** Name of the CAS view that holds the gold annotations during testing. */
  public static final String GOLD_VIEW_NAME = "GoldView";

  /** Name of the CAS view the models read from and write to (the default sofa). */
  public static final String SYSTEM_VIEW_NAME = CAS.NAME_DEFAULT_SOFA;

  private final File trainDir;

  private final File testDir;

  /** The training and test directories, in that order, as handed to the collection reader. */
  private final List<File> dataDirs;

  private final List<String> goldAnnotatorParamsForViewsRequiredBySystem;

  private final String goldAnnotatorParamForViewAnnotatedBySystem;

  private final String timemlWriterParamForViewAnnotatedBySystem;

  private final List<AnalysisEngineDescription> preprocessingAnnotators;

  private final List<? extends ModelInfo<?>> modelInfos;

  /**
   * Creates an evaluation over the given training and test directories.
   *
   * @param trainDir
   *          directory of the training portion of the corpus
   * @param testDir
   *          directory of the test portion of the corpus
   * @param outputDirectory
   *          base directory passed on to {@link Evaluation_ImplBase}
   * @param goldAnnotatorParamsForViewsRequiredBySystem
   *          {@link TempEval2010GoldAnnotator} view parameters whose annotations the models need
   *          as input; these are placed in the system view
   * @param goldAnnotatorParamForViewAnnotatedBySystem
   *          the {@link TempEval2010GoldAnnotator} view parameter whose annotations the models
   *          are trained to produce
   * @param timemlWriterParamForViewAnnotatedBySystem
   *          the {@link TempEval2010Writer} view parameter that is pointed at the system view
   *          when writing the models' output
   * @param preprocessingAnnotators
   *          annotators run after the gold annotator when the XMI files are created
   * @param modelInfos
   *          the models to train and evaluate
   * @throws Exception
   *           declared for compatibility with existing callers
   */
  public TempEval2010Evaluation(
      File trainDir,
      File testDir,
      File outputDirectory,
      List<String> goldAnnotatorParamsForViewsRequiredBySystem,
      String goldAnnotatorParamForViewAnnotatedBySystem,
      String timemlWriterParamForViewAnnotatedBySystem,
      List<AnalysisEngineDescription> preprocessingAnnotators,
      List<? extends ModelInfo<?>> modelInfos) throws Exception {
    super(outputDirectory);

    this.trainDir = trainDir;
    this.testDir = testDir;
    this.dataDirs = Arrays.asList(trainDir, testDir);
    this.goldAnnotatorParamsForViewsRequiredBySystem = goldAnnotatorParamsForViewsRequiredBySystem;
    this.goldAnnotatorParamForViewAnnotatedBySystem = goldAnnotatorParamForViewAnnotatedBySystem;
    this.timemlWriterParamForViewAnnotatedBySystem = timemlWriterParamForViewAnnotatedBySystem;
    this.preprocessingAnnotators = preprocessingAnnotators;
    this.modelInfos = modelInfos;
  }

  /**
   * Creates a reader over the training and test directories restricted to the given items.
   *
   * @param items
   *          the items to read; duplicates are collapsed because they are passed on as a set
   * @return a {@link TempEval2010CollectionReader} based reader for those items
   */
  @Override
  protected CollectionReader getCollectionReader(List<String> items) throws Exception {
    Set<String> selectedItems = new HashSet<String>(items);
    return TempEval2010CollectionReader.getCollectionReader(this.dataDirs, selectedItems);
  }

  /**
   * Writes classifier training data for every model, trains and packages each classifier, and
   * finally removes non-model files from each model factory's pre-defined training directory.
   *
   * @param collectionReader
   *          reader over the training items
   * @param directory
   *          directory below which the XMI cache and the per-model subdirectories are placed
   */
  @Override
  protected void train(CollectionReader collectionReader, File directory) throws Exception {

    // run the XMI reader and the classifier data writers
    AggregateBuilder builder = new AggregateBuilder();
    builder.add(AnalysisEngineFactory.createEngineDescription(
        XMIReader.class,
        XMIReader.PARAM_XMI_DIRECTORY,
        this.getXMIDirectory(directory, Stage.TRAIN).getPath()));
    for (ModelInfo<?> modelInfo : this.modelInfos) {
      File outputDir = modelInfo.getModelSubdirectory(directory);
      builder.add(modelInfo.modelFactory.getWriterDescription(outputDir));
    }
    SimplePipeline.runPipeline(collectionReader, builder.createAggregateDescription());

    // train the classifiers
    for (ModelInfo<?> modelInfo : this.modelInfos) {
      File modelDir = modelInfo.getModelSubdirectory(directory);
      JarClassifierBuilder.trainAndPackage(modelDir, modelInfo.trainingArguments);
    }

    // if building to the pre-defined training directory, clean up non-model files
    for (ModelInfo<?> modelInfo : this.modelInfos) {
      File modelDir = modelInfo.modelFactory.getTrainingDirectory();
      // listFiles() yields null when the path is missing, is not a directory, or cannot be read;
      // in all of those cases there is nothing to clean up
      File[] files = modelDir.listFiles();
      if (files == null) {
        continue;
      }
      File modelFile = JarClassifierBuilder.getModelJarFile(modelDir);
      for (File file : files) {
        if (!file.isDirectory() && !file.equals(modelFile)) {
          // best-effort cleanup: a file that cannot be deleted is left in place
          file.delete();
        }
      }
    }
  }

  /**
   * Runs every model's annotator over the test-stage XMI files, writes the system output with
   * {@link TempEval2010Writer} to an <code>eval</code> subdirectory, and collects statistics per
   * model by handing the gold view and the system view of each CAS to
   * {@link ModelInfo#updateStatistics}.
   *
   * @param collectionReader
   *          reader over the test items
   * @param directory
   *          directory holding the trained models; also receives the <code>eval</code> output
   * @return a map from each model to the statistics gathered for it
   */
  @Override
  protected Map<ModelInfo<?>, AnnotationStatistics<String>> test(
      CollectionReader collectionReader,
      File directory) throws Exception {
    // prepare the XMI reader, the classifiers and the TempEval writer
    AggregateBuilder builder = new AggregateBuilder();
    builder.add(AnalysisEngineFactory.createEngineDescription(
        XMIReader.class,
        XMIReader.PARAM_XMI_DIRECTORY,
        this.getXMIDirectory(directory, Stage.TEST).getPath()));
    for (ModelInfo<?> modelInfo : this.modelInfos) {
      File modelFile = JarClassifierBuilder.getModelJarFile(modelInfo.getModelSubdirectory(directory));
      builder.add(modelInfo.modelFactory.getAnnotatorDescription(modelFile.getPath()));
    }
    builder.add(AnalysisEngineFactory.createEngineDescription(
        TempEval2010Writer.class,
        TempEval2010Writer.PARAM_OUTPUT_DIRECTORY,
        new File(directory, "eval").getPath(),
        TempEval2010Writer.PARAM_TEXT_VIEW,
        SYSTEM_VIEW_NAME,
        this.timemlWriterParamForViewAnnotatedBySystem,
        SYSTEM_VIEW_NAME));

    // create statistics for each feature that is classified
    Map<ModelInfo<?>, AnnotationStatistics<String>> modelInfoToStatistics;
    modelInfoToStatistics = new HashMap<ModelInfo<?>, AnnotationStatistics<String>>();
    for (ModelInfo<?> modelInfo : this.modelInfos) {
      modelInfoToStatistics.put(modelInfo, new AnnotationStatistics<String>());
    }

    // gather statistics over all the CASes in the test set
    AnalysisEngine engine = builder.createAggregate();
    JCasIterator iter = new JCasIterator(collectionReader, engine);
    while (iter.hasNext()) {
      JCas jCas = iter.next();
      JCas goldView = jCas.getView(GOLD_VIEW_NAME);
      JCas systemView = jCas.getView(SYSTEM_VIEW_NAME);
      for (ModelInfo<?> modelInfo : this.modelInfos) {
        AnnotationStatistics<String> statistics = modelInfoToStatistics.get(modelInfo);
        modelInfo.updateStatistics(statistics, goldView, systemView);
      }
    }
    engine.collectionProcessComplete();
    return modelInfoToStatistics;
  }

  /** The two phases of the evaluation; each gets its own XMI cache directory. */
  private enum Stage {
    TRAIN, TEST
  }

  /**
   * Returns the directory of cached XMI files for the given stage, creating the files first if
   * the directory does not exist yet.
   * <p>
   * The directory is <code>&lt;directory&gt;/xmi/&lt;name&gt;/&lt;stage&gt;</code>, where
   * <code>name</code> is the part of the annotated-view parameter after its last <code>'.'</code>,
   * or the whole parameter when it contains no <code>'.'</code>. When the XMI files are created,
   * every annotated file of both the training and the test directory is processed.
   *
   * @param directory
   *          directory below which the <code>xmi</code> cache lives
   * @param stage
   *          the stage, which decides the views the gold annotations are written to
   * @return the (existing or newly populated) XMI directory
   */
  private File getXMIDirectory(File directory, Stage stage) throws Exception {
    String annotatedParam = this.goldAnnotatorParamForViewAnnotatedBySystem;
    // lastIndexOf yields -1 when there is no '.', so "+ 1" keeps the whole name in that case
    // (clamping the index to 0 first would cut off the first character)
    String name = annotatedParam.substring(annotatedParam.lastIndexOf('.') + 1);
    File xmiDirectory = new File(new File(new File(directory, "xmi"), name), stage.toString());

    // reuse XMIs from an earlier run
    if (xmiDirectory.exists()) {
      return xmiDirectory;
    }

    // read every annotated file from both the training and the test directory
    Set<String> fileNames = new HashSet<String>();
    fileNames.addAll(TempEval2010CollectionReader.getAnnotatedFileNames(this.trainDir));
    fileNames.addAll(TempEval2010CollectionReader.getAnnotatedFileNames(this.testDir));
    CollectionReader reader = TempEval2010CollectionReader.getCollectionReader(
        this.dataDirs,
        fileNames);

    // run the gold annotator, the preprocessing annotators, and the XMI writer
    AggregateBuilder builder = new AggregateBuilder();
    builder.add(AnalysisEngineFactory.createEngineDescription(
        TempEval2010GoldAnnotator.class,
        this.createGoldAnnotatorParameters(stage)));
    for (AnalysisEngineDescription desc : this.preprocessingAnnotators) {
      builder.add(desc);
    }
    builder.add(AnalysisEngineFactory.createEngineDescription(
        XmiWriter.class,
        XmiWriter.PARAM_OUTPUT_DIRECTORY,
        xmiDirectory.getPath()));
    SimplePipeline.runPipeline(reader, builder.createAggregateDescription());

    return xmiDirectory;
  }

  /**
   * Builds the alternating (parameter name, view names) configuration array for
   * {@link TempEval2010GoldAnnotator}, covering every view parameter of the gold annotator.
   */
  private Object[] createGoldAnnotatorParameters(Stage stage) {
    List<String> viewParams = Arrays.asList(
        TempEval2010GoldAnnotator.PARAM_TEXT_VIEWS,
        TempEval2010GoldAnnotator.PARAM_DOCUMENT_CREATION_TIME_VIEWS,
        TempEval2010GoldAnnotator.PARAM_TIME_EXTENT_VIEWS,
        TempEval2010GoldAnnotator.PARAM_TIME_ATTRIBUTE_VIEWS,
        TempEval2010GoldAnnotator.PARAM_EVENT_EXTENT_VIEWS,
        TempEval2010GoldAnnotator.PARAM_EVENT_ATTRIBUTE_VIEWS,
        TempEval2010GoldAnnotator.PARAM_TEMPORAL_LINK_EVENT_TO_DOCUMENT_CREATION_TIME_VIEWS,
        TempEval2010GoldAnnotator.PARAM_TEMPORAL_LINK_EVENT_TO_SAME_SENTENCE_TIME_VIEWS,
        TempEval2010GoldAnnotator.PARAM_TEMPORAL_LINK_EVENT_TO_SUBORDINATED_EVENT_VIEWS,
        TempEval2010GoldAnnotator.PARAM_TEMPORAL_LINK_MAIN_EVENT_TO_NEXT_SENTENCE_MAIN_EVENT_VIEWS);

    List<Object> goldAnnotatorParams = new ArrayList<Object>();
    for (String viewParam : viewParams) {
      goldAnnotatorParams.add(viewParam);
      goldAnnotatorParams.add(this.getViewNames(viewParam, stage));
    }
    return goldAnnotatorParams.toArray();
  }

  /** Decides which views the annotations of one gold annotator view parameter are written to. */
  private String[] getViewNames(String viewParam, Stage stage) {
    boolean requiredBySystem = this.goldAnnotatorParamsForViewsRequiredBySystem.contains(viewParam);
    boolean annotatedBySystem = viewParam.equals(this.goldAnnotatorParamForViewAnnotatedBySystem);

    switch (stage) {
      case TRAIN:
        // during training, put all required annotations, and the annotations from which the
        // models will be trained, in the system view
        if (requiredBySystem || annotatedBySystem) {
          return new String[] { SYSTEM_VIEW_NAME };
        }
        return new String[] {};

      case TEST:
        // during testing, put required annotation in both views, and the annotations which the
        // model is supposed to predict only in the gold view
        if (requiredBySystem) {
          return new String[] { SYSTEM_VIEW_NAME, GOLD_VIEW_NAME };
        }
        if (annotatedBySystem) {
          return new String[] { GOLD_VIEW_NAME };
        }
        return new String[] {};

      default:
        throw new IllegalArgumentException("unsupported stage: " + stage);
    }
  }

  /**
   * Annotator that fills each CAS from an XMI file in a configured directory. The file is named
   * after the fragment of the CAS's URI (as returned by {@link ViewUriUtil#getURI}) with an
   * <code>.xmi</code> suffix.
   */
  public static class XMIReader extends JCasAnnotator_ImplBase {

    public static final String PARAM_XMI_DIRECTORY = "xmiDirectory";

    @ConfigurationParameter(
        name = PARAM_XMI_DIRECTORY,
        mandatory = true)
    protected File xmiDirectory;

    /**
     * Returns the XMI file that corresponds to the given CAS.
     *
     * @param jCas
     *          the CAS whose URI fragment names the file
     * @return <code>&lt;xmiDirectory&gt;/&lt;fragment&gt;.xmi</code>
     */
    protected File getFile(JCas jCas) throws AnalysisEngineProcessException {
      String fileName = ViewUriUtil.getURI(jCas).getFragment() + ".xmi";
      return new File(this.xmiDirectory, fileName);
    }

    /**
     * Deserializes the XMI file for this CAS into the CAS.
     *
     * @throws AnalysisEngineProcessException
     *           wrapping any {@link SAXException} or {@link IOException} raised while reading
     */
    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
      try {
        FileInputStream stream = new FileInputStream(this.getFile(jCas));
        try {
          XmiCasDeserializer.deserialize(stream, jCas.getCas());
        } finally {
          // always release the file handle, even when deserialization fails
          stream.close();
        }
      } catch (SAXException e) {
        throw new AnalysisEngineProcessException(e);
      } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
      }
    }
  }

}