001/*
002 * Copyright (c) 2011, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.timeml.eval;
025
026import java.io.File;
027import java.io.FileInputStream;
028import java.io.IOException;
029import java.util.ArrayList;
030import java.util.Arrays;
031import java.util.HashMap;
032import java.util.HashSet;
033import java.util.List;
034import java.util.Map;
035import java.util.Set;
036
037import org.apache.uima.analysis_engine.AnalysisEngine;
038import org.apache.uima.analysis_engine.AnalysisEngineDescription;
039import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
040import org.apache.uima.cas.CAS;
041import org.apache.uima.cas.impl.XmiCasDeserializer;
042import org.apache.uima.collection.CollectionReader;
043import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
044import org.apache.uima.fit.descriptor.ConfigurationParameter;
045import org.apache.uima.fit.factory.AggregateBuilder;
046import org.apache.uima.fit.factory.AnalysisEngineFactory;
047import org.apache.uima.fit.pipeline.JCasIterator;
048import org.apache.uima.fit.pipeline.SimplePipeline;
049import org.apache.uima.jcas.JCas;
050import org.cleartk.corpus.timeml.TempEval2010CollectionReader;
051import org.cleartk.corpus.timeml.TempEval2010GoldAnnotator;
052import org.cleartk.corpus.timeml.TempEval2010Writer;
053import org.cleartk.eval.AnnotationStatistics;
054import org.cleartk.eval.Evaluation_ImplBase;
055import org.cleartk.ml.jar.JarClassifierBuilder;
056import org.cleartk.util.ViewUriUtil;
057import org.cleartk.util.ae.XmiWriter;
058import org.xml.sax.SAXException;
059
060/**
061 * <br>
062 * Copyright (c) 2011, Regents of the University of Colorado <br>
063 * All rights reserved.
064 * 
065 * @author Steven Bethard
066 */
067public class TempEval2010Evaluation extends
068    Evaluation_ImplBase<String, Map<ModelInfo<?>, AnnotationStatistics<String>>> {
069
070  public static final String GOLD_VIEW_NAME = "GoldView";
071
072  public static final String SYSTEM_VIEW_NAME = CAS.NAME_DEFAULT_SOFA;
073
074  private File trainDir;
075
076  private File testDir;
077
078  private List<File> dataDirs;
079
080  private List<String> goldAnnotatorParamsForViewsRequiredBySystem;
081
082  private String goldAnnotatorParamForViewAnnotatedBySystem;
083
084  private String timemlWriterParamForViewAnnotatedBySystem;
085
086  private List<AnalysisEngineDescription> preprocessingAnnotators;
087
088  private List<? extends ModelInfo<?>> modelInfos;
089
090  public TempEval2010Evaluation(
091      File trainDir,
092      File testDir,
093      File outputDirectory,
094      List<String> goldAnnotatorParamsForViewsRequiredBySystem,
095      String goldAnnotatorParamForViewAnnotatedBySystem,
096      String timemlWriterParamForViewAnnotatedBySystem,
097      List<AnalysisEngineDescription> preprocessingAnnotators,
098      List<? extends ModelInfo<?>> modelInfos) throws Exception {
099    super(outputDirectory);
100
101    this.trainDir = trainDir;
102    this.testDir = testDir;
103    this.dataDirs = Arrays.asList(trainDir, testDir);
104    this.goldAnnotatorParamsForViewsRequiredBySystem = goldAnnotatorParamsForViewsRequiredBySystem;
105    this.goldAnnotatorParamForViewAnnotatedBySystem = goldAnnotatorParamForViewAnnotatedBySystem;
106    this.timemlWriterParamForViewAnnotatedBySystem = timemlWriterParamForViewAnnotatedBySystem;
107    this.preprocessingAnnotators = preprocessingAnnotators;
108    this.modelInfos = modelInfos;
109  }
110
111  @Override
112  protected CollectionReader getCollectionReader(List<String> items) throws Exception {
113    return TempEval2010CollectionReader.getCollectionReader(this.dataDirs, new HashSet<String>(
114        items));
115  }
116
117  @Override
118  protected void train(CollectionReader collectionReader, File directory) throws Exception {
119
120    // run the XMI reader and the classifier data writers
121    AggregateBuilder builder = new AggregateBuilder();
122    builder.add(AnalysisEngineFactory.createEngineDescription(
123        XMIReader.class,
124        XMIReader.PARAM_XMI_DIRECTORY,
125        this.getXMIDirectory(directory, Stage.TRAIN).getPath()));
126    for (ModelInfo<?> modelInfo : this.modelInfos) {
127      File outputDir = modelInfo.getModelSubdirectory(directory);
128      builder.add(modelInfo.modelFactory.getWriterDescription(outputDir));
129    }
130    SimplePipeline.runPipeline(collectionReader, builder.createAggregateDescription());
131
132    // train the classifiers
133    for (ModelInfo<?> modelInfo : this.modelInfos) {
134      File modelDir = modelInfo.getModelSubdirectory(directory);
135      JarClassifierBuilder.trainAndPackage(modelDir, modelInfo.trainingArguments);
136    }
137
138    // if building to the pre-defined training directory, clean up non-model files
139    for (ModelInfo<?> modelInfo : this.modelInfos) {
140      File modelDir = modelInfo.modelFactory.getTrainingDirectory();
141      if (modelDir.exists()) {
142        for (File file : modelDir.listFiles()) {
143          File modelFile = JarClassifierBuilder.getModelJarFile(modelDir);
144          if (!file.isDirectory() && !file.equals(modelFile)) {
145            file.delete();
146          }
147        }
148      }
149    }
150  }
151
152  @Override
153  protected Map<ModelInfo<?>, AnnotationStatistics<String>> test(
154      CollectionReader collectionReader,
155      File directory) throws Exception {
156    // prepare the XMI reader, the classifiers and the TempEval writer
157    AggregateBuilder builder = new AggregateBuilder();
158    builder.add(AnalysisEngineFactory.createEngineDescription(
159        XMIReader.class,
160        XMIReader.PARAM_XMI_DIRECTORY,
161        this.getXMIDirectory(directory, Stage.TEST).getPath()));
162    for (ModelInfo<?> modelInfo : this.modelInfos) {
163      File modelFile = JarClassifierBuilder.getModelJarFile(modelInfo.getModelSubdirectory(directory));
164      builder.add(modelInfo.modelFactory.getAnnotatorDescription(modelFile.getPath()));
165    }
166    builder.add(AnalysisEngineFactory.createEngineDescription(
167        TempEval2010Writer.class,
168        TempEval2010Writer.PARAM_OUTPUT_DIRECTORY,
169        new File(directory, "eval").getPath(),
170        TempEval2010Writer.PARAM_TEXT_VIEW,
171        SYSTEM_VIEW_NAME,
172        this.timemlWriterParamForViewAnnotatedBySystem,
173        SYSTEM_VIEW_NAME));
174
175    // create statistics for each feature that is classified
176    Map<ModelInfo<?>, AnnotationStatistics<String>> modelInfoToStatistics;
177    modelInfoToStatistics = new HashMap<ModelInfo<?>, AnnotationStatistics<String>>();
178    for (ModelInfo<?> modelInfo : this.modelInfos) {
179      modelInfoToStatistics.put(modelInfo, new AnnotationStatistics<String>());
180    }
181
182    // gather statistics over all the CASes in the test set
183    AnalysisEngine engine = builder.createAggregate();
184    JCasIterator iter = new JCasIterator(collectionReader, engine);
185    while (iter.hasNext()) {
186      JCas jCas = iter.next();
187      JCas goldView = jCas.getView(GOLD_VIEW_NAME);
188      JCas systemView = jCas.getView(SYSTEM_VIEW_NAME);
189      for (ModelInfo<?> modelInfo : this.modelInfos) {
190        AnnotationStatistics<String> statistics = modelInfoToStatistics.get(modelInfo);
191        modelInfo.updateStatistics(statistics, goldView, systemView);
192      }
193    }
194    engine.collectionProcessComplete();
195    return modelInfoToStatistics;
196  }
197
198  private enum Stage {
199    TRAIN, TEST
200  }
201
202  private File getXMIDirectory(File directory, Stage stage) throws Exception {
203    int dotIndex = Math.max(0, this.goldAnnotatorParamForViewAnnotatedBySystem.lastIndexOf('.'));
204    String name = this.goldAnnotatorParamForViewAnnotatedBySystem.substring(dotIndex + 1);
205    File xmiDirectory = new File(new File(new File(directory, "xmi"), name), stage.toString());
206
207    // create XMIs if necessary
208    if (!xmiDirectory.exists()) {
209      Set<String> fileNames = new HashSet<String>();
210      fileNames.addAll(TempEval2010CollectionReader.getAnnotatedFileNames(this.trainDir));
211      fileNames.addAll(TempEval2010CollectionReader.getAnnotatedFileNames(this.testDir));
212      CollectionReader reader = TempEval2010CollectionReader.getCollectionReader(
213          this.dataDirs,
214          fileNames);
215
216      List<String> viewParams = Arrays.asList(
217          TempEval2010GoldAnnotator.PARAM_TEXT_VIEWS,
218          TempEval2010GoldAnnotator.PARAM_DOCUMENT_CREATION_TIME_VIEWS,
219          TempEval2010GoldAnnotator.PARAM_TIME_EXTENT_VIEWS,
220          TempEval2010GoldAnnotator.PARAM_TIME_ATTRIBUTE_VIEWS,
221          TempEval2010GoldAnnotator.PARAM_EVENT_EXTENT_VIEWS,
222          TempEval2010GoldAnnotator.PARAM_EVENT_ATTRIBUTE_VIEWS,
223          TempEval2010GoldAnnotator.PARAM_TEMPORAL_LINK_EVENT_TO_DOCUMENT_CREATION_TIME_VIEWS,
224          TempEval2010GoldAnnotator.PARAM_TEMPORAL_LINK_EVENT_TO_SAME_SENTENCE_TIME_VIEWS,
225          TempEval2010GoldAnnotator.PARAM_TEMPORAL_LINK_EVENT_TO_SUBORDINATED_EVENT_VIEWS,
226          TempEval2010GoldAnnotator.PARAM_TEMPORAL_LINK_MAIN_EVENT_TO_NEXT_SENTENCE_MAIN_EVENT_VIEWS);
227
228      // determine the view parameters for the gold annotator
229      List<Object> goldAnnotatorParams = new ArrayList<Object>();
230      for (String viewParam : viewParams) {
231        goldAnnotatorParams.add(viewParam);
232        String[] paramValue;
233        switch (stage) {
234          case TRAIN:
235            // during training, put all required annotations, and the annotations from which the
236            // models will be trained, in the system view
237            paramValue = this.goldAnnotatorParamsForViewsRequiredBySystem.contains(viewParam)
238                || viewParam.equals(this.goldAnnotatorParamForViewAnnotatedBySystem)
239                ? new String[] { SYSTEM_VIEW_NAME }
240                : new String[] {};
241            break;
242
243          case TEST:
244            // during testing, put required annotation in both views, and the annotations which the
245            // model is supposed to predict only in the gold view
246            if (this.goldAnnotatorParamsForViewsRequiredBySystem.contains(viewParam)) {
247              paramValue = new String[] { SYSTEM_VIEW_NAME, GOLD_VIEW_NAME };
248            } else if (viewParam.equals(this.goldAnnotatorParamForViewAnnotatedBySystem)) {
249              paramValue = new String[] { GOLD_VIEW_NAME };
250            } else {
251              paramValue = new String[] {};
252            }
253            break;
254
255          default:
256            throw new IllegalArgumentException();
257        }
258        goldAnnotatorParams.add(paramValue);
259      }
260
261      // run the gold annotator, the preprocessing annotators, and the XMI writer
262      AggregateBuilder builder = new AggregateBuilder();
263      builder.add(AnalysisEngineFactory.createEngineDescription(
264          TempEval2010GoldAnnotator.class,
265          goldAnnotatorParams.toArray()));
266      for (AnalysisEngineDescription desc : this.preprocessingAnnotators) {
267        builder.add(desc);
268      }
269      builder.add(AnalysisEngineFactory.createEngineDescription(
270          XmiWriter.class,
271          XmiWriter.PARAM_OUTPUT_DIRECTORY,
272          xmiDirectory.getPath()));
273      SimplePipeline.runPipeline(reader, builder.createAggregateDescription());
274    }
275
276    return xmiDirectory;
277  }
278
279  public static class XMIReader extends JCasAnnotator_ImplBase {
280
281    @ConfigurationParameter(
282        name = PARAM_XMI_DIRECTORY,
283        mandatory = true)
284    protected File xmiDirectory;
285
286    public static final String PARAM_XMI_DIRECTORY = "xmiDirectory";
287
288    protected File getFile(JCas jCas) throws AnalysisEngineProcessException {
289      return new File(this.xmiDirectory, ViewUriUtil.getURI(jCas).getFragment() + ".xmi");
290    }
291    @Override
292    public void process(JCas jCas) throws AnalysisEngineProcessException {
293      try {
294        FileInputStream stream = new FileInputStream(this.getFile(jCas));
295        try {
296          XmiCasDeserializer.deserialize(stream, jCas.getCas());
297        } finally {
298          stream.close();
299        }
300      } catch (SAXException e) {
301        throw new AnalysisEngineProcessException(e);
302      } catch (IOException e) {
303        throw new AnalysisEngineProcessException(e);
304      }
305    }
306  }
307
308}