001/* 002 * Copyright (c) 2013, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.timeml.eval; 025 026import java.io.File; 027import java.io.IOException; 028import java.net.URI; 029import java.util.LinkedHashSet; 030import java.util.List; 031import java.util.Map; 032import java.util.Queue; 033import java.util.Set; 034 035import org.apache.uima.UimaContext; 036import org.apache.uima.analysis_engine.AnalysisEngine; 037import org.apache.uima.analysis_engine.AnalysisEngineDescription; 038import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 039import org.apache.uima.cas.CAS; 040import org.apache.uima.cas.CASException; 041import org.apache.uima.cas.Feature; 042import org.apache.uima.collection.CollectionReader; 043import org.apache.uima.fit.component.JCasAnnotator_ImplBase; 044import org.apache.uima.fit.component.ViewCreatorAnnotator; 045import org.apache.uima.fit.descriptor.ConfigurationParameter; 046import org.apache.uima.fit.factory.AggregateBuilder; 047import org.apache.uima.fit.factory.AnalysisEngineFactory; 048import org.apache.uima.fit.pipeline.JCasIterator; 049import org.apache.uima.fit.pipeline.SimplePipeline; 050import org.apache.uima.fit.util.JCasUtil; 051import org.apache.uima.jcas.JCas; 052import org.apache.uima.jcas.cas.TOP; 053import org.apache.uima.resource.ResourceInitializationException; 054import org.apache.uima.util.CasCopier; 055import org.cleartk.corpus.timeml.PlainTextTlinkGoldAnnotator; 056import org.cleartk.corpus.timeml.TempEval2013Writer; 057import org.cleartk.corpus.timeml.TimeMlGoldAnnotator; 058import org.cleartk.eval.AnnotationStatistics; 059import org.cleartk.eval.Evaluation_ImplBase; 060import org.cleartk.ml.liblinear.LibLinearStringOutcomeDataWriter; 061import org.cleartk.opennlp.tools.ParserAnnotator; 062import org.cleartk.opennlp.tools.PosTaggerAnnotator; 063import org.cleartk.opennlp.tools.SentenceAnnotator; 064import org.cleartk.snowball.DefaultSnowballStemmer; 065import org.cleartk.timeml.event.EventAnnotator; 066import org.cleartk.timeml.event.EventAspectAnnotator; 067import org.cleartk.timeml.event.EventClassAnnotator; 068import org.cleartk.timeml.event.EventModalityAnnotator; 069import org.cleartk.timeml.event.EventPolarityAnnotator; 070import org.cleartk.timeml.event.EventTenseAnnotator; 071import org.cleartk.timeml.time.TimeAnnotator; 072import org.cleartk.timeml.time.TimeTypeAnnotator; 073import org.cleartk.timeml.tlink.TemporalLinkAnnotator_ImplBase; 074import org.cleartk.timeml.tlink.TemporalLinkEventToDocumentCreationTimeAnnotator; 075import org.cleartk.timeml.tlink.TemporalLinkEventToSameSentenceTimeAnnotator; 076import org.cleartk.timeml.tlink.TemporalLinkEventToSubordinatedEventAnnotator; 077import org.cleartk.timeml.type.Anchor; 078import org.cleartk.timeml.type.DocumentCreationTime; 079import org.cleartk.timeml.type.Event; 080import org.cleartk.timeml.type.TemporalLink; 081import org.cleartk.timeml.type.Text; 082import org.cleartk.timeml.type.Time; 083import org.cleartk.token.tokenizer.TokenAnnotator; 084import org.cleartk.util.ViewUriUtil; 085import org.cleartk.util.ae.UriToDocumentTextAnnotator; 086import org.cleartk.util.cr.UriCollectionReader; 087import org.jdom2.Document; 088import org.jdom2.Element; 089import org.jdom2.JDOMException; 090import org.jdom2.filter.Filters; 091import org.jdom2.input.SAXBuilder; 092import org.jdom2.output.XMLOutputter; 093 094import com.google.common.base.Function; 095import com.google.common.collect.ImmutableMultimap; 096import com.google.common.collect.ImmutableTable; 097import com.google.common.collect.LinkedHashMultimap; 098import com.google.common.collect.Lists; 099import com.google.common.collect.Maps; 100import com.google.common.collect.Multimap; 101import com.google.common.collect.Queues; 102import com.google.common.collect.Sets; 103import com.google.common.collect.Table; 104import com.lexicalscope.jewel.cli.CliFactory; 105import com.lexicalscope.jewel.cli.Option; 106 107/** 108 * Trains and evaluates event, time and temporal relation models on the TempEval 2013 data. 109 * 110 * <br> 111 * Copyright (c) 2013, Regents of the University of Colorado <br> 112 * All rights reserved. 113 * 114 * @author Steven Bethard 115 */ 116public class TempEval2013Evaluation 117 extends 118 Evaluation_ImplBase<File, ImmutableTable<Model<?>, Model.Params, AnnotationStatistics<String>>> { 119 120 interface Options { 121 122 @Option(longName = "train-dirs") 123 List<File> getTrainDirectories(); 124 125 @Option(longName = "test-dirs", defaultToNull = true) 126 List<File> getTestDirectories(); 127 128 @Option(longName = "inferred-tlinks", defaultToNull = true) 129 List<File> getInferredTLinksDirectories(); 130 131 @Option(longName = "verb-clause-tlinks") 132 boolean getVerbClauseTLinks(); 133 134 @Option(longName = "relations-only") 135 boolean getRelationsOnly(); 136 137 @Option(longName = "tune", defaultToNull = true) 138 String getNameOfModelToTune(); 139 140 @Option(longName = "train-only") 141 boolean getTrainOnly(); 142 } 143 144 public static void main(String[] args) throws Exception { 145 Options options = CliFactory.parseArguments(Options.class, args); 146 147 List<File> trainFiles = listAllFiles(options.getTrainDirectories()); 148 List<File> testFiles = listAllFiles(options.getTestDirectories()); 149 150 // map names to models 151 List<Model<?>> allModels = Lists.<Model<?>> newArrayList( 152 TIME_EXTENT_MODEL, 153 TIME_TYPE_MODEL, 154 EVENT_EXTENT_MODEL, 155 EVENT_ASPECT_MODEL, 156 EVENT_CLASS_MODEL, 157 EVENT_MODALITY_MODEL, 158 EVENT_POLARITY_MODEL, 159 EVENT_TENSE_MODEL, 160 TLINK_EVENT_DOCTIME_MODEL, 161 TLINK_EVENT_SENTTIME_MODEL, 162 TLINK_EVENT_SUBORDEVENT_MODEL); 163 Map<String, Model<?>> nameToModel = Maps.newHashMap(); 164 for (Model<?> model : allModels) { 165 nameToModel.put(model.name, model); 166 } 167 168 // determine which parameters each model should be trained with 169 ImmutableMultimap.Builder<Model<?>, Model.Params> modelsBuilder = ImmutableMultimap.builder(); 170 String nameOfModelToTune = options.getNameOfModelToTune(); 171 if (nameOfModelToTune == null) { 172 for (Model<?> model : allModels) { 173 if (!options.getRelationsOnly() || model.name.startsWith("tlink")) { 174 modelsBuilder.put(model, model.bestParams); 175 } 176 } 177 } else { 178 Model<?> modelToTune = nameToModel.get(nameOfModelToTune); 179 if (modelToTune == null) { 180 throw new IllegalArgumentException("No such model: " + nameOfModelToTune); 181 } 182 for (Model<?> model : getSortedPrerequisites(modelToTune)) { 183 if (!options.getRelationsOnly() || model.name.startsWith("tlink")) { 184 modelsBuilder.put(model, model.bestParams); 185 } 186 } 187 for (Model.Params params : modelToTune.paramsToSearch) { 188 modelsBuilder.put(modelToTune, params); 189 } 190 } 191 ImmutableMultimap<Model<?>, Model.Params> models = modelsBuilder.build(); 192 193 // create the evaluation manager 194 File evalDir = new File("target/tempeval2013"); 195 TempEval2013Evaluation evaluation = new TempEval2013Evaluation( 196 evalDir, 197 models, 198 options.getInferredTLinksDirectories(), 199 options.getVerbClauseTLinks(), 200 options.getRelationsOnly()); 201 202 // just train a model 203 if (options.getTrainOnly()) { 204 if (!testFiles.isEmpty()) { 205 throw new IllegalArgumentException("Cannot specify test files when only training"); 206 } 207 evaluation.train(evaluation.getCollectionReader(trainFiles), Model.DEFAULT_DIRECTORY); 208 for (Model<?> model : models.keySet()) { 209 for (Model.Params params : models.get(model)) { 210 model.cleanTrainingFiles(Model.DEFAULT_DIRECTORY, params); 211 } 212 } 213 } else { 214 215 // run a simple train-and-test 216 ImmutableTable<Model<?>, Model.Params, AnnotationStatistics<String>> modelStats; 217 if (!testFiles.isEmpty()) { 218 modelStats = evaluation.trainAndTest(trainFiles, testFiles); 219 } 220 221 // run a cross-validation 222 else { 223 List<ImmutableTable<Model<?>, Model.Params, AnnotationStatistics<String>>> foldStats; 224 foldStats = evaluation.crossValidation(trainFiles, 2); 225 226 // prepare a table of stats for all models and parameters 227 ImmutableTable.Builder<Model<?>, Model.Params, AnnotationStatistics<String>> modelStatsBuilder = ImmutableTable.builder(); 228 for (Model<?> model : models.keySet()) { 229 for (Model.Params params : models.get(model)) { 230 modelStatsBuilder.put(model, params, new AnnotationStatistics<String>()); 231 } 232 } 233 modelStats = modelStatsBuilder.build(); 234 235 // combine all fold stats into a single overall stats 236 for (Table<Model<?>, Model.Params, AnnotationStatistics<String>> foldTable : foldStats) { 237 for (Table.Cell<Model<?>, Model.Params, AnnotationStatistics<String>> cell : foldTable.cellSet()) { 238 modelStats.get(cell.getRowKey(), cell.getColumnKey()).addAll(cell.getValue()); 239 } 240 } 241 } 242 243 // print out all model performance 244 for (Model<?> model : models.keySet()) { 245 for (Model.Params params : modelStats.row(model).keySet()) { 246 System.err.printf("== %s %s ==\n", model.name, params); 247 System.err.println(modelStats.get(model, params)); 248 } 249 } 250 } 251 } 252 253 private static List<File> listAllFiles(List<File> directories) { 254 List<File> files = Lists.newArrayList(); 255 if (directories != null) { 256 for (File dir : directories) { 257 for (File file : dir.listFiles()) { 258 if (!file.getName().startsWith(".") && !file.isHidden()) { 259 files.add(file); 260 } 261 } 262 } 263 } 264 return files; 265 } 266 267 private static Set<Model<?>> getPrerequisites(Model<?> model) { 268 Set<Model<?>> prereqs = Sets.newLinkedHashSet(); 269 for (Model<?> prereq : model.prerequisites) { 270 prereqs.add(prereq); 271 prereqs.addAll(getPrerequisites(prereq)); 272 } 273 return prereqs; 274 } 275 276 private static LinkedHashSet<Model<?>> getSortedPrerequisites(Model<?> model) { 277 Queue<Model<?>> todo = Queues.newArrayDeque(); 278 Multimap<Model<?>, Model<?>> following = LinkedHashMultimap.create(); 279 for (Model<?> prereq : getPrerequisites(model)) { 280 if (prereq.prerequisites.isEmpty()) { 281 todo.add(prereq); 282 } else { 283 for (Model<?> preprereq : prereq.prerequisites) { 284 following.put(preprereq, prereq); 285 } 286 } 287 } 288 LinkedHashSet<Model<?>> models = Sets.newLinkedHashSet(); 289 while (!todo.isEmpty()) { 290 Model<?> next = todo.iterator().next(); 291 todo.remove(next); 292 models.add(next); 293 for (Model<?> prereq : following.removeAll(next)) { 294 if (!following.containsKey(prereq)) { 295 todo.add(prereq); 296 } 297 } 298 } 299 return models; 300 } 301 302 private static Function<TemporalLink, List<Integer>> TEMPORAL_LINK_TO_SPANS = new Function<TemporalLink, List<Integer>>() { 303 @Override 304 public List<Integer> apply(TemporalLink temporalLink) { 305 // order source and target indexes, left-to-right 306 Anchor source = temporalLink.getSource(); 307 Anchor target = temporalLink.getTarget(); 308 return source.getBegin() < target.getBegin() 309 ? Lists.newArrayList(source.getBegin(), source.getEnd(), target.getBegin(), target.getEnd()) 310 : Lists.newArrayList(target.getBegin(), target.getEnd(), source.getBegin(), source.getEnd()); 311 } 312 }; 313 314 private static Function<TemporalLink, String> TEMPORAL_LINK_TO_RELATION = new Function<TemporalLink, String>() { 315 @Override 316 public String apply(TemporalLink temporalLink) { 317 // match relation with left-to-right ordering of indexes 318 Anchor source = temporalLink.getSource(); 319 Anchor target = temporalLink.getTarget(); 320 return source.getBegin() < target.getBegin() 321 ? temporalLink.getRelationType() 322 : TemporalLinkAnnotator_ImplBase.REVERSE_RELATION.get(temporalLink.getRelationType()); 323 } 324 }; 325 326 private static List<Model.Params> SEQUENCE_CLASSIFIER_PARAM_SEARCH_SPACE = Lists.newArrayList( 327 // L2-regularized L2-loss support vector classification (dual) 328 new Model.Params(LibLinearStringOutcomeDataWriter.class, 1, "-c", "0.1", "-s", "1"), 329 new Model.Params(LibLinearStringOutcomeDataWriter.class, 1, "-c", "0.5", "-s", "1"), 330 new Model.Params(LibLinearStringOutcomeDataWriter.class, 1, "-c", "1", "-s", "1"), 331 new Model.Params(LibLinearStringOutcomeDataWriter.class, 1, "-c", "5", "-s", "1"), 332 new Model.Params(LibLinearStringOutcomeDataWriter.class, 1, "-c", "10", "-s", "1"), 333 new Model.Params(LibLinearStringOutcomeDataWriter.class, 1, "-c", "50", "-s", "1"), 334 new Model.Params(LibLinearStringOutcomeDataWriter.class, 2, "-c", "0.1", "-s", "1"), 335 new Model.Params(LibLinearStringOutcomeDataWriter.class, 2, "-c", "0.5", "-s", "1"), 336 new Model.Params(LibLinearStringOutcomeDataWriter.class, 2, "-c", "1", "-s", "1"), 337 new Model.Params(LibLinearStringOutcomeDataWriter.class, 2, "-c", "5", "-s", "1"), 338 new Model.Params(LibLinearStringOutcomeDataWriter.class, 2, "-c", "10", "-s", "1"), 339 new Model.Params(LibLinearStringOutcomeDataWriter.class, 2, "-c", "50", "-s", "1"), 340 new Model.Params(LibLinearStringOutcomeDataWriter.class, 3, "-c", "0.1", "-s", "1"), 341 new Model.Params(LibLinearStringOutcomeDataWriter.class, 3, "-c", "0.5", "-s", "1"), 342 new Model.Params(LibLinearStringOutcomeDataWriter.class, 3, "-c", "1", "-s", "1"), 343 new Model.Params(LibLinearStringOutcomeDataWriter.class, 3, "-c", "5", "-s", "1"), 344 new Model.Params(LibLinearStringOutcomeDataWriter.class, 3, "-c", "10", "-s", "1"), 345 new Model.Params(LibLinearStringOutcomeDataWriter.class, 3, "-c", "50", "-s", "1")); 346// // default is --iterations 500 --gaussian-variance 10 347// new Model.Params(MalletCRFStringOutcomeDataWriter.class), 348// new Model.Params(MalletCRFStringOutcomeDataWriter.class, "--forbidden", "O,I"), 349// new Model.Params(MalletCRFStringOutcomeDataWriter.class, "--iterations", "100"), 350// new Model.Params(MalletCRFStringOutcomeDataWriter.class, "--iterations", "1000"), 351// new Model.Params(MalletCRFStringOutcomeDataWriter.class, "--gaussian-variance", "1"), 352// new Model.Params(MalletCRFStringOutcomeDataWriter.class, "--gaussian-variance", "100")); 353 354// private static final String priorFlag = "--gaussianPriorVariance"; 355 356 private static List<Model.Params> CLASSIFIER_PARAM_SEARCH_SPACE = Lists.newArrayList( 357// // default is --gaussianPriorVariance 1 358// new Model.Params(MalletStringOutcomeDataWriter.class, "MaxEnt"), 359// new Model.Params(MalletStringOutcomeDataWriter.class, "MaxEnt", priorFlag, "0.1"), 360// new Model.Params(MalletStringOutcomeDataWriter.class, "MaxEnt", priorFlag, "10"), 361// // default is [iterations cutoff] 100 5 362// new Model.Params(MaxentStringOutcomeDataWriter.class), 363// new Model.Params(MaxentStringOutcomeDataWriter.class, "100", "10"), 364// new Model.Params(MaxentStringOutcomeDataWriter.class, "500", "5"), 365 // L2-regularized logistic regression (primal) 366 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "0.1", "-s", "0"), 367 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "0.5", "-s", "0"), 368 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "1", "-s", "0"), 369 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "5", "-s", "0"), 370 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "10", "-s", "0"), 371 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "50", "-s", "0"), 372 // L2-regularized L2-loss support vector classification (dual) 373 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "0.1", "-s", "1"), 374 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "0.5", "-s", "1"), 375 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "1", "-s", "1"), 376 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "5", "-s", "1"), 377 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "10", "-s", "1"), 378 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "50", "-s", "1")); 379 380 private static final Model<Time> TIME_EXTENT_MODEL = new Model<Time>( 381 "time-extent", 382 Lists.<Model<?>> newArrayList(), 383 TimeAnnotator.class, 384 new Model.Params(LibLinearStringOutcomeDataWriter.class, 1, "-c", "0.1", "-s", "1"), 385 SEQUENCE_CLASSIFIER_PARAM_SEARCH_SPACE, 386 Model.EvaluationType.NORMAL, 387 Model.LoggingType.NONE, 388 Time.class, 389 AnnotationStatistics.<Time> annotationToSpan(), 390 AnnotationStatistics.<Time, String> annotationToNull(), 391 null); 392 393 private static final Model<Time> TIME_TYPE_MODEL = new Model<Time>( 394 "time-type", 395 Lists.<Model<?>> newArrayList(TIME_EXTENT_MODEL), 396 TimeTypeAnnotator.class, 397 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "5", "-s", "0"), 398 CLASSIFIER_PARAM_SEARCH_SPACE, 399 Model.EvaluationType.NORMAL, 400 Model.LoggingType.NONE, 401 Time.class, 402 AnnotationStatistics.<Time> annotationToSpan(), 403 AnnotationStatistics.<Time> annotationToFeatureValue("timeType"), 404 "timeType"); 405 406 private static final Model<Event> EVENT_EXTENT_MODEL = new Model<Event>( 407 "event-extent", 408 Lists.<Model<?>> newArrayList(), 409 EventAnnotator.class, 410 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "0.1", "-s", "1"), 411 CLASSIFIER_PARAM_SEARCH_SPACE, 412 Model.EvaluationType.NORMAL, 413 Model.LoggingType.NONE, 414 Event.class, 415 AnnotationStatistics.<Event> annotationToSpan(), 416 AnnotationStatistics.<Event, String> annotationToNull(), 417 null); 418 419 private static final Model<Event> EVENT_ASPECT_MODEL = new Model<Event>( 420 "event-aspect", 421 Lists.<Model<?>> newArrayList(EVENT_EXTENT_MODEL), 422 EventAspectAnnotator.class, 423 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "5", "-s", "0"), 424 CLASSIFIER_PARAM_SEARCH_SPACE, 425 Model.EvaluationType.NORMAL, 426 Model.LoggingType.NONE, 427 Event.class, 428 AnnotationStatistics.<Event> annotationToSpan(), 429 AnnotationStatistics.<Event> annotationToFeatureValue("aspect"), 430 "aspect"); 431 432 private static final Model<Event> EVENT_CLASS_MODEL = new Model<Event>( 433 "event-class", 434 Lists.<Model<?>> newArrayList(EVENT_EXTENT_MODEL), 435 EventClassAnnotator.class, 436 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "5", "-s", "0"), 437 CLASSIFIER_PARAM_SEARCH_SPACE, 438 Model.EvaluationType.NORMAL, 439 Model.LoggingType.NONE, 440 Event.class, 441 AnnotationStatistics.<Event> annotationToSpan(), 442 AnnotationStatistics.<Event> annotationToFeatureValue("eventClass"), 443 "eventClass"); 444 445 private static final Model<Event> EVENT_MODALITY_MODEL = new Model<Event>( 446 "event-modality", 447 Lists.<Model<?>> newArrayList(EVENT_EXTENT_MODEL), 448 EventModalityAnnotator.class, 449 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "1", "-s", "1"), 450 CLASSIFIER_PARAM_SEARCH_SPACE, 451 Model.EvaluationType.NORMAL, 452 Model.LoggingType.NONE, 453 Event.class, 454 AnnotationStatistics.<Event> annotationToSpan(), 455 AnnotationStatistics.<Event> annotationToFeatureValue("modality"), 456 "modality"); 457 458 private static final Model<Event> EVENT_POLARITY_MODEL = new Model<Event>( 459 "event-polarity", 460 Lists.<Model<?>> newArrayList(EVENT_EXTENT_MODEL), 461 EventPolarityAnnotator.class, 462 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "1", "-s", "1"), 463 CLASSIFIER_PARAM_SEARCH_SPACE, 464 Model.EvaluationType.NORMAL, 465 Model.LoggingType.NONE, 466 Event.class, 467 AnnotationStatistics.<Event> annotationToSpan(), 468 AnnotationStatistics.<Event> annotationToFeatureValue("polarity"), 469 "polarity"); 470 471 private static final Model<Event> EVENT_TENSE_MODEL = new Model<Event>( 472 "event-tense", 473 Lists.<Model<?>> newArrayList(EVENT_EXTENT_MODEL), 474 EventTenseAnnotator.class, 475 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "0.5", "-s", "1"), 476 CLASSIFIER_PARAM_SEARCH_SPACE, 477 Model.EvaluationType.NORMAL, 478 Model.LoggingType.NONE, 479 Event.class, 480 AnnotationStatistics.<Event> annotationToSpan(), 481 AnnotationStatistics.<Event> annotationToFeatureValue("tense"), 482 "tense"); 483 484 private static final Model<TemporalLink> TLINK_EVENT_DOCTIME_MODEL = new Model<TemporalLink>( 485 "tlink-event-doctime", 486 Lists.<Model<?>> newArrayList( 487 EVENT_EXTENT_MODEL, 488 EVENT_ASPECT_MODEL, 489 EVENT_CLASS_MODEL, 490 EVENT_MODALITY_MODEL, 491 EVENT_POLARITY_MODEL, 492 EVENT_TENSE_MODEL), 493 TemporalLinkEventToDocumentCreationTimeAnnotator.class, 494 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "50", "-s", "1"), 495 CLASSIFIER_PARAM_SEARCH_SPACE, 496 Model.EvaluationType.INTERSECTED_SPANS, 497 Model.LoggingType.NONE, 498 TemporalLink.class, 499 TEMPORAL_LINK_TO_SPANS, 500 TEMPORAL_LINK_TO_RELATION, 501 null); 502 503 private static final Model<TemporalLink> TLINK_EVENT_SENTTIME_MODEL = new Model<TemporalLink>( 504 "tlink-event-senttime", 505 Lists.<Model<?>> newArrayList( 506 TIME_EXTENT_MODEL, 507 TIME_TYPE_MODEL, 508 EVENT_EXTENT_MODEL, 509 EVENT_ASPECT_MODEL, 510 EVENT_CLASS_MODEL, 511 EVENT_MODALITY_MODEL, 512 EVENT_POLARITY_MODEL, 513 EVENT_TENSE_MODEL), 514 TemporalLinkEventToSameSentenceTimeAnnotator.class, 515 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "0.5", "-s", "1"), 516 CLASSIFIER_PARAM_SEARCH_SPACE, 517 Model.EvaluationType.INTERSECTED_SPANS, 518 Model.LoggingType.NONE, 519 TemporalLink.class, 520 TEMPORAL_LINK_TO_SPANS, 521 TEMPORAL_LINK_TO_RELATION, 522 null); 523 524 private static final Model<TemporalLink> TLINK_EVENT_SUBORDEVENT_MODEL = new Model<TemporalLink>( 525 "tlink-event-subordevent", 526 Lists.<Model<?>> newArrayList( 527 EVENT_EXTENT_MODEL, 528 EVENT_ASPECT_MODEL, 529 EVENT_CLASS_MODEL, 530 EVENT_MODALITY_MODEL, 531 EVENT_POLARITY_MODEL, 532 EVENT_TENSE_MODEL), 533 TemporalLinkEventToSubordinatedEventAnnotator.class, 534 new Model.Params(LibLinearStringOutcomeDataWriter.class, "-c", "0.1", "-s", "1"), 535 CLASSIFIER_PARAM_SEARCH_SPACE, 536 Model.EvaluationType.INTERSECTED_SPANS, 537 Model.LoggingType.NONE, 538 TemporalLink.class, 539 TEMPORAL_LINK_TO_SPANS, 540 TEMPORAL_LINK_TO_RELATION, 541 null); 542 543 private ImmutableMultimap<Model<?>, Model.Params> models; 544 545 private List<File> inferredTLinksDirectories; 546 547 private boolean useVerbClauseTlinks; 548 549 private boolean relationsOnly; 550 551 public TempEval2013Evaluation( 552 File baseDirectory, 553 ImmutableMultimap<Model<?>, Model.Params> models, 554 List<File> inferredTLinksDirectories, 555 boolean useVerbClauseTlinks, 556 boolean relationsOnly) { 557 super(baseDirectory); 558 this.models = models; 559 this.inferredTLinksDirectories = inferredTLinksDirectories; 560 this.useVerbClauseTlinks = useVerbClauseTlinks; 561 this.relationsOnly = relationsOnly; 562 } 563 564 @Override 565 protected CollectionReader getCollectionReader(List<File> files) throws Exception { 566 return UriCollectionReader.getCollectionReaderFromFiles(files); 567 } 568 569 @Override 570 public void train(CollectionReader reader, File directory) throws Exception { 571 AggregateBuilder builder = new AggregateBuilder(); 572 573 // read the manual TimeML annotations into the CAS 574 builder.add(AnalysisEngineFactory.createEngineDescription( 575 ViewCreatorAnnotator.class, 576 ViewCreatorAnnotator.PARAM_VIEW_NAME, 577 TimeMlGoldAnnotator.TIMEML_VIEW_NAME)); 578 builder.add( 579 UriToDocumentTextAnnotator.getDescription(), 580 CAS.NAME_DEFAULT_SOFA, 581 TimeMlGoldAnnotator.TIMEML_VIEW_NAME); 582 builder.add(TimeMlGoldAnnotator.getDescription()); 583 if (this.inferredTLinksDirectories != null) { 584 builder.add(AnalysisEngineFactory.createEngineDescription( 585 UseInferredTlinks.class, 586 UseInferredTlinks.PARAM_INFERRED_TLINKS_DIRECTORIES, 587 this.inferredTLinksDirectories)); 588 } 589 if (this.useVerbClauseTlinks) { 590 builder.add(PlainTextTlinkGoldAnnotator.getDescription()); 591 } 592 builder.add(AnalysisEngineFactory.createEngineDescription(FixTimeML.class)); 593 594 // only add sentences and other annotations under <TEXT> 595 builder.add(AnalysisEngineFactory.createEngineDescription( 596 SentenceAnnotator.class, 597 SentenceAnnotator.PARAM_SENTENCE_MODEL_PATH, 598 "/models/en-sent.bin", 599 SentenceAnnotator.PARAM_WINDOW_CLASS_NAMES, 600 new Class<?>[] { Text.class })); 601 builder.add(TokenAnnotator.getDescription()); 602 builder.add(PosTaggerAnnotator.getDescription()); 603 builder.add(DefaultSnowballStemmer.getDescription("English")); 604 builder.add(ParserAnnotator.getDescription()); 605 606 // add a data write for each model and its various parameters 607 for (Model<?> model : this.models.keySet()) { 608 for (Model.Params params : this.models.get(model)) { 609 builder.add(model.getWriterDescription(directory, params)); 610 } 611 } 612 613 // run the pipeline 614 SimplePipeline.runPipeline(reader, builder.createAggregate()); 615 616 // train each model with each of its various parameters 617 for (Model<?> model : this.models.keySet()) { 618 for (Model.Params params : this.models.get(model)) { 619 System.err.printf("Training: %s %s\n", model.name, params); 620 model.train(directory, params); 621 } 622 } 623 } 624 625 @Override 626 protected ImmutableTable<Model<?>, Model.Params, AnnotationStatistics<String>> test( 627 CollectionReader reader, 628 File directory) throws Exception { 629 String goldViewName = "GoldView"; 630 AggregateBuilder preprocess = new AggregateBuilder(); 631 632 // read the manual TimeML annotations into the gold view 633 preprocess.add(AnalysisEngineFactory.createEngineDescription( 634 ViewCreatorAnnotator.class, 635 ViewCreatorAnnotator.PARAM_VIEW_NAME, 636 TimeMlGoldAnnotator.TIMEML_VIEW_NAME)); 637 preprocess.add( 638 UriToDocumentTextAnnotator.getDescription(), 639 CAS.NAME_DEFAULT_SOFA, 640 TimeMlGoldAnnotator.TIMEML_VIEW_NAME); 641 preprocess.add(AnalysisEngineFactory.createEngineDescription( 642 ViewCreatorAnnotator.class, 643 ViewCreatorAnnotator.PARAM_VIEW_NAME, 644 goldViewName)); 645 preprocess.add(TimeMlGoldAnnotator.getDescription(), CAS.NAME_DEFAULT_SOFA, goldViewName); 646 if (this.inferredTLinksDirectories != null) { 647 preprocess.add(AnalysisEngineFactory.createEngineDescription( 648 UseInferredTlinks.class, 649 UseInferredTlinks.PARAM_INFERRED_TLINKS_DIRECTORIES, 650 this.inferredTLinksDirectories), CAS.NAME_DEFAULT_SOFA, goldViewName); 651 } 652 if (this.useVerbClauseTlinks) { 653 preprocess.add( 654 PlainTextTlinkGoldAnnotator.getDescription(), 655 CAS.NAME_DEFAULT_SOFA, 656 goldViewName); 657 } 658 preprocess.add( 659 AnalysisEngineFactory.createEngineDescription(FixTimeML.class), 660 CAS.NAME_DEFAULT_SOFA, 661 goldViewName); 662 preprocess.add(AnalysisEngineFactory.createEngineDescription( 663 CopyTextAndDocumentCreationTime.class, 664 CopyTextAndDocumentCreationTime.PARAM_SOURCE_VIEW, 665 goldViewName)); 666 if (this.relationsOnly) { 667 preprocess.add(AnalysisEngineFactory.createEngineDescription( 668 CopyEventsAndTimes.class, 669 CopyEventsAndTimes.PARAM_SOURCE_VIEW, 670 goldViewName)); 671 } 672 673 // only add sentences and other annotations under <TEXT> 674 preprocess.add(AnalysisEngineFactory.createEngineDescription( 675 SentenceAnnotator.class, 676 SentenceAnnotator.PARAM_SENTENCE_MODEL_PATH, 677 "/models/en-sent.bin", 678 SentenceAnnotator.PARAM_WINDOW_CLASS_NAMES, 679 new Class<?>[] { Text.class })); 680 preprocess.add(TokenAnnotator.getDescription()); 681 preprocess.add(PosTaggerAnnotator.getDescription()); 682 preprocess.add(DefaultSnowballStemmer.getDescription("English")); 683 preprocess.add(ParserAnnotator.getDescription()); 684 AnalysisEngine preprocessEngine = preprocess.createAggregate(); 685 686 // finalize TLINK ids and write out TimeML files 687 AggregateBuilder postprocess = new AggregateBuilder(); 688 postprocess.add(AnalysisEngineFactory.createEngineDescription(ShrinkTimesContainingEvents.class)); 689 postprocess.add(AnalysisEngineFactory.createEngineDescription(SetTemporalLinkIDs.class)); 690 postprocess.add(TempEval2013Writer.getDescription(new File(this.baseDirectory, "timeml"))); 691 AnalysisEngine postprocessEngine = postprocess.createAggregate(); 692 693 // create one AnalysisEngine and one AnnotationStatistics for each model/parameters combination 694 ImmutableTable.Builder<Model<?>, Model.Params, AnalysisEngine> enginesBuilder = ImmutableTable.builder(); 695 ImmutableTable.Builder<Model<?>, Model.Params, AnnotationStatistics<String>> statsBuilder = ImmutableTable.builder(); 696 for (Model<?> model : this.models.keySet()) { 697 for (Model.Params params : this.models.get(model)) { 698 AnalysisEngineDescription desc = model.getAnnotatorDescription(directory, params); 699 enginesBuilder.put(model, params, AnalysisEngineFactory.createEngine(desc)); 700 statsBuilder.put(model, params, new AnnotationStatistics<String>()); 701 } 702 } 703 ImmutableTable<Model<?>, Model.Params, AnalysisEngine> engines = enginesBuilder.build(); 704 ImmutableTable<Model<?>, Model.Params, AnnotationStatistics<String>> stats = statsBuilder.build(); 705 706 // evaluate each CAS in the test data 707 JCasIterator iter = new JCasIterator(reader, preprocessEngine); 708 while (iter.hasNext()) { 709 JCas jCas = iter.next(); 710 711 // evaluate each model/parameters combination 712 JCas goldView = jCas.getView(goldViewName); 713 JCas systemView = jCas.getView(CAS.NAME_DEFAULT_SOFA); 714 for (Model<?> model : engines.rowKeySet()) { 715 // remove any annotations from previous models that would interfere with the evaluation 716 Map<? extends TOP, String> annotations = model.removeModelAnnotations(jCas); 717 718 // apply and evaluate the model with each set of parameters 719 for (Map.Entry<Model.Params, AnalysisEngine> entry : engines.row(model).entrySet()) { 720 Model.Params params = entry.getKey(); 721 AnalysisEngine engine = entry.getValue(); 722 723 // remove any annotations from this model with other parameter settings 724 model.removeModelAnnotations(jCas); 725 726 // process and evaluate 727 engine.process(jCas); 728 model.evaluate(goldView, systemView, stats.get(model, params)); 729 } 730 731 // restore any annotations from previous models 732 model.restoreModelAnnotations(jCas, annotations); 733 } 734 735 postprocessEngine.process(jCas); 736 } 737 return stats; 738 } 739 740 public static class CopyTextAndDocumentCreationTime extends JCasAnnotator_ImplBase { 741 742 public static final String PARAM_SOURCE_VIEW = "SourceView"; 743 744 @ConfigurationParameter(name = PARAM_SOURCE_VIEW) 745 private String sourceViewName; 746 747 @Override 748 public void process(JCas jCas) throws AnalysisEngineProcessException { 749 JCas sourceView; 750 try { 751 sourceView = jCas.getView(this.sourceViewName); 752 } catch (CASException e) { 753 throw new AnalysisEngineProcessException(e); 754 } 755 CasCopier copier = new CasCopier(sourceView.getCas(), jCas.getCas()); 756 Feature sofaFeature = jCas.getTypeSystem().getFeatureByFullName(CAS.FEATURE_FULL_NAME_SOFA); 757 758 // copy document text 759 jCas.setDocumentText(sourceView.getDocumentText()); 760 761 // copy text annotation 762 Text sourceText = JCasUtil.selectSingle(sourceView, Text.class); 763 Text text = (Text) copier.copyFs(sourceText); 764 text.setFeatureValue(sofaFeature, jCas.getSofa()); 765 text.addToIndexes(); 766 767 // copy document creation time 768 DocumentCreationTime sourceTime = JCasUtil.selectSingle( 769 sourceView, 770 DocumentCreationTime.class); 771 DocumentCreationTime time = (DocumentCreationTime) copier.copyFs(sourceTime); 772 time.setFeatureValue(sofaFeature, jCas.getSofa()); 773 time.addToIndexes(); 774 } 775 } 776 777 public static class CopyEventsAndTimes extends JCasAnnotator_ImplBase { 778 779 public static final String PARAM_SOURCE_VIEW = "SourceView"; 780 781 @ConfigurationParameter(name = PARAM_SOURCE_VIEW) 782 private String sourceViewName; 783 784 @Override 785 public void process(JCas jCas) throws AnalysisEngineProcessException { 786 JCas sourceView; 787 try { 788 sourceView = jCas.getView(this.sourceViewName); 789 } catch (CASException e) { 790 throw new AnalysisEngineProcessException(e); 791 } 792 CasCopier copier = new CasCopier(sourceView.getCas(), jCas.getCas()); 793 Feature sofaFeature = jCas.getTypeSystem().getFeatureByFullName(CAS.FEATURE_FULL_NAME_SOFA); 794 for (Event sourceEvent : JCasUtil.select(sourceView, Event.class)) { 795 Event event = (Event) copier.copyFs(sourceEvent); 796 event.setFeatureValue(sofaFeature, jCas.getSofa()); 797 if (event.getEventInstanceID() == null) { 798 event.setEventInstanceID(event.getId().replaceAll("^e", "ei")); 799 } 800 event.addToIndexes(); 801 } 802 for (Time sourceTime : JCasUtil.select(sourceView, Time.class)) { 803 if (!(sourceTime instanceof DocumentCreationTime)) { 804 Time time = (Time) copier.copyFs(sourceTime); 805 time.setFeatureValue(sofaFeature, jCas.getSofa()); 806 time.addToIndexes(); 807 } 808 } 809 } 810 } 811 812 public static class FixTimeML extends JCasAnnotator_ImplBase { 813 814 private static Set<String> IS_SIMULTANEOUS = Sets.newHashSet("DURING", "DURING_INV", "IDENTITY"); 815 816 @Override 817 public void process(JCas jCas) throws AnalysisEngineProcessException { 818 // add missing event attributes 819 for (Event event : JCasUtil.select(jCas, Event.class)) { 820 if (event.getAspect() == null) { 821 event.setAspect("NONE"); 822 } 823 if (event.getModality() == null) { 824 event.setModality("none"); 825 } 826 String modality = event.getModality(); 827 event.setModality(modality.toLowerCase().replaceAll("_", " ").replaceAll("^'d$", "would")); 828 if (event.getPolarity() == null) { 829 event.setPolarity("POS"); 830 } 831 if (event.getTense() == null) { 832 event.setTense("NONE"); 833 } 834 } 835 836 // simplify simultaneous relations 837 for (TemporalLink tlink : JCasUtil.select(jCas, TemporalLink.class)) { 838 if (IS_SIMULTANEOUS.contains(tlink.getRelationType())) { 839 tlink.setRelationType("SIMULTANEOUS"); 840 } 841 } 842 843 // remove overlap relations 844 for (TemporalLink tlink : Lists.newArrayList(JCasUtil.select(jCas, TemporalLink.class))) { 845 if ("OVERLAP".equals(tlink.getRelationType())) { 846 tlink.removeFromIndexes(); 847 } 848 } 849 } 850 } 851 852 public static class SetTemporalLinkIDs extends JCasAnnotator_ImplBase { 853 854 @Override 855 public void process(JCas jCas) throws AnalysisEngineProcessException { 856 int index = 1; 857 for (TemporalLink tlink : JCasUtil.select(jCas, TemporalLink.class)) { 858 tlink.setId(String.format("l%d", index)); 859 ++index; 860 } 861 862 } 863 } 864 865 public static class ShrinkTimesContainingEvents extends JCasAnnotator_ImplBase { 866 867 @Override 868 public void process(JCas jCas) throws AnalysisEngineProcessException { 869 String text = jCas.getDocumentText(); 870 for (Time time : JCasUtil.select(jCas, Time.class)) { 871 List<Event> events = JCasUtil.selectCovered(jCas, Event.class, time); 872 if (!events.isEmpty()) { 873 int eventsBegin = events.get(0).getBegin(); 874 int eventsEnd = events.get(events.size() - 1).getEnd(); 875 int timeBegin, timeEnd; 876 if (time.getBegin() - eventsBegin > time.getEnd() - eventsEnd) { 877 timeBegin = time.getBegin(); 878 timeEnd = eventsBegin - 1; 879 while (timeEnd > timeBegin && Character.isWhitespace(text.charAt(timeEnd))) { 880 --timeEnd; 881 } 882 } else { 883 timeEnd = time.getEnd(); 884 timeBegin = eventsEnd; 885 while (timeBegin < timeEnd && Character.isWhitespace(text.charAt(timeBegin))) { 886 ++timeBegin; 887 } 888 } 889 String oldText = time.getCoveredText(); 890 time.setBegin(timeBegin); 891 time.setEnd(timeEnd); 892 String newText = time.getCoveredText(); 893 this.getLogger().warn(String.format("shrinking \"%s\" to \"%s\"", oldText, newText)); 894 } 895 } 896 897 } 898 } 899 900 public static class UseInferredTlinks extends JCasAnnotator_ImplBase { 901 902 public static final String PARAM_INFERRED_TLINKS_DIRECTORIES = "InferredTLinksDirectories"; 903 904 @ConfigurationParameter(name = PARAM_INFERRED_TLINKS_DIRECTORIES, mandatory = true) 905 private List<File> inferredTLinksDirectories; 906 private Map<String, File> fileNameToFile; 907 908 909 @Override 910 public void initialize(UimaContext context) throws ResourceInitializationException { 911 super.initialize(context); 912 this.fileNameToFile = Maps.newHashMap(); 913 for (File dir : this.inferredTLinksDirectories) { 914 for (File file : dir.listFiles()) { 915 String fileName = file.getName(); 916 if (fileName.endsWith(".tml")) { 917 String extension = String.format("[.]%s[.]tml$", dir.getName()); 918 this.fileNameToFile.put(fileName.replaceAll(extension, ".tml"), file); 919 } 920 } 921 } 922 } 923 924 @Override 925 public void process(JCas jCas) throws AnalysisEngineProcessException { 926 String fileName = new File(ViewUriUtil.getURI(jCas).getPath()).getName(); 927 File inferredTLinksFile = this.fileNameToFile.get(fileName); 928 929 if (inferredTLinksFile == null) { 930 this.getLogger().warn("No inferred TLINKs found for " + fileName); 931 } else { 932 933 // remove existing temporal links 934 for (TemporalLink tlink : Lists.newArrayList(JCasUtil.select(jCas, TemporalLink.class))) { 935 tlink.removeFromIndexes(); 936 } 937 938 // parse the XML document 939 SAXBuilder builder = new SAXBuilder(); 940 Document xml; 941 try { 942 xml = builder.build(inferredTLinksFile); 943 } catch (JDOMException e) { 944 throw new AnalysisEngineProcessException(e); 945 } catch (IOException e) { 946 throw new AnalysisEngineProcessException(e); 947 } 948 949 // index all anchors by their IDs 950 Map<String, Anchor> idToAnchor = Maps.newHashMap(); 951 for (Anchor anchor : JCasUtil.select(jCas, Anchor.class)) { 952 idToAnchor.put(anchor.getId(), anchor); 953 if (anchor instanceof Event) { 954 idToAnchor.put(((Event) anchor).getEventInstanceID(), anchor); 955 } 956 } 957 958 // create a TemporalLink for each TLINK in the file 959 int offset = jCas.getDocumentText().length(); 960 for (Element linkElem : xml.getDescendants(Filters.element("TLINK"))) { 961 // get the relation 962 String relationType = linkElem.getAttributeValue("relType"); 963 if (relationType == null) { 964 error(jCas, linkElem, "No relation type specified in %s"); 965 } 966 967 // get the source 968 String sourceEventID = linkElem.getAttributeValue("eventInstanceID"); 969 String sourceTimeID = linkElem.getAttributeValue("timeID"); 970 if (!(sourceEventID == null ^ sourceTimeID == null)) { 971 error(jCas, linkElem, "Expected exactly 1 source attribute, found %s"); 972 } 973 String sourceID = sourceEventID != null ? sourceEventID : sourceTimeID; 974 Anchor source = idToAnchor.get(sourceID); 975 if (source == null) { 976 this.getLogger().warn( 977 errorString(jCas, linkElem, "No annotation found for source of %s")); 978 continue; 979 } 980 981 // get the target 982 String targetEventID = linkElem.getAttributeValue("relatedToEventInstance"); 983 String targetTimeID = linkElem.getAttributeValue("relatedToTime"); 984 if (!(targetEventID == null ^ targetTimeID == null)) { 985 error(jCas, linkElem, "Expected exactly 1 target attribute, found %s"); 986 } 987 String targetID = targetEventID != null ? targetEventID : targetTimeID; 988 Anchor target = idToAnchor.get(targetID); 989 if (target == null) { 990 this.getLogger().warn( 991 errorString(jCas, linkElem, "No annotation found for target of %s")); 992 continue; 993 } 994 995 // add the temporal link 996 TemporalLink link = new TemporalLink(jCas, offset, offset); 997 link.setRelationType(relationType); 998 link.setSource(source); 999 link.setTarget(target); 1000 link.addToIndexes(); 1001 } 1002 } 1003 } 1004 1005 private static String errorString(JCas jCas, Element element, String message) 1006 throws AnalysisEngineProcessException { 1007 URI uri = ViewUriUtil.getURI(jCas); 1008 String elemString = new XMLOutputter().outputString(element); 1009 return String.format("In %s: " + message, uri, elemString); 1010 } 1011 1012 private static void error(JCas jCas, Element element, String message) 1013 throws AnalysisEngineProcessException { 1014 throw new IllegalArgumentException(errorString(jCas, element, message)); 1015 } 1016 } 1017}