/**
 * Copyright (c) 2007-2008, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.timeml.tlink;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import org.cleartk.feature.syntax.TargetPathExtractor;
import org.cleartk.feature.token.TokenTextForSelectedPosExtractor;
import org.cleartk.ml.CleartkAnnotator;
import org.cleartk.ml.Instance;
import org.cleartk.ml.feature.extractor.CleartkExtractor;
import org.cleartk.ml.feature.extractor.CoveredTextExtractor;
import org.cleartk.ml.feature.extractor.FeatureExtractor1;
import org.cleartk.ml.feature.extractor.NamingExtractor1;
import org.cleartk.ml.feature.extractor.TypePathExtractor;
import org.cleartk.ml.feature.extractor.CleartkExtractor.Bag;
import org.cleartk.ml.feature.extractor.CleartkExtractor.Covered;
import org.cleartk.ml.feature.extractor.CleartkExtractor.Preceding;
import org.cleartk.ml.liblinear.LibLinearStringOutcomeDataWriter;
import org.cleartk.syntax.constituent.type.TopTreebankNode;
import org.cleartk.syntax.constituent.type.TreebankNode;
import org.cleartk.timeml.type.Anchor;
import org.cleartk.timeml.type.Event;
import org.cleartk.timeml.type.TemporalLink;
import org.cleartk.timeml.util.CleartkInternalModelFactory;
import org.cleartk.token.type.Sentence;
import org.cleartk.token.type.Token;
import org.cleartk.util.AnnotationUtil;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
065import org.apache.uima.fit.util.JCasUtil; 066 067import com.google.common.collect.Lists; 068 069/** 070 * <br> 071 * Copyright (c) 2007-2008, Regents of the University of Colorado <br> 072 * All rights reserved. 073 * 074 * 075 * 076 * @author Steven Bethard 077 */ 078@TypeCapability( 079 outputs = { "org.cleartk.timeml.type.TemporalLink", "org.cleartk.timeml.type.Event" }) 080public class VerbClauseTemporalAnnotator extends CleartkAnnotator<String> { 081 082 public static final CleartkInternalModelFactory FACTORY = new CleartkInternalModelFactory() { 083 @Override 084 public Class<?> getAnnotatorClass() { 085 return VerbClauseTemporalAnnotator.class; 086 } 087 088 @Override 089 public Class<?> getDataWriterClass() { 090 return LibLinearStringOutcomeDataWriter.class; 091 } 092 093 @Override 094 public AnalysisEngineDescription getBaseDescription() throws ResourceInitializationException { 095 return AnalysisEngineFactory.createEngineDescription(VerbClauseTemporalAnnotator.class); 096 } 097 }; 098 099 private static final Map<String, String[]> headMap = new HashMap<String, String[]>(); 100 static { 101 headMap.put("S", "VP S SBAR ADJP".split(" ")); 102 headMap.put("SBAR", "VP S SBAR ADJP".split(" ")); 103 headMap.put("VP", ("VP VB VBZ VBP VBG VBN VBD JJ JJR JJS " 104 + "NNS NN PRP NNPS NNP ADJP NP S SBAR").split(" ")); 105 headMap.put("ADJP", "ADJP VB VBZ VBP VBG VBN VBD JJ JJR JJS".split(" ")); 106 headMap.put("NP", "NP NNS NN PRP NNPS NNP QP ADJP".split(" ")); 107 headMap.put("QP", "NP NNS NN PRP NNPS NNP QP ADJP".split(" ")); 108 } 109 110 private static final Set<String> stopWords = new HashSet<String>( 111 Arrays.asList("be been is 's am are was were has had have".split(" "))); 112 113 private List<FeatureExtractor1<Token>> sourceFeatureExtractors; 114 115 private List<FeatureExtractor1<Token>> targetFeatureExtractors; 116 117 private List<FeatureExtractor1<Annotation>> betweenAnchorsFeatureExtractors; 118 119 private TargetPathExtractor pathExtractor; 120 
121 private int eventID; 122 123 @ConfigurationParameter( 124 name = PARAM_CREATE_EVENTS, 125 defaultValue = "false", description = "Create events for all verbs in " 126 + "verb-clause relations (using existing events if present, but adding new ones " 127 + "wherever they are not present).") 128 private boolean createEvents; 129 130 public static final String PARAM_CREATE_EVENTS = "createEvents"; 131 132 public VerbClauseTemporalAnnotator() { 133 this.eventID = 1; 134 135 FeatureExtractor1<Token> precedingAuxiliaries = new CleartkExtractor<Token, Token>( 136 Token.class, 137 new TokenTextForSelectedPosExtractor("MD", "TO", "IN", "VB", "RB"), 138 new Preceding(3)); 139 FeatureExtractor1<Token> tokenStemExtractor = new TypePathExtractor<Token>(Token.class, "stem"); 140 FeatureExtractor1<Token> tokenPOSExtractor = new TypePathExtractor<Token>(Token.class, "pos"); 141 142 this.sourceFeatureExtractors = Lists.newArrayList(); 143 this.sourceFeatureExtractors.add(new NamingExtractor1<Token>("Source", new CoveredTextExtractor<Token>())); 144 this.sourceFeatureExtractors.add(new NamingExtractor1<Token>("Source", tokenPOSExtractor)); 145 this.sourceFeatureExtractors.add(new NamingExtractor1<Token>("Source", tokenStemExtractor)); 146 this.sourceFeatureExtractors.add(new NamingExtractor1<Token>("Source", precedingAuxiliaries)); 147 148 this.targetFeatureExtractors = Lists.newArrayList(); 149 this.targetFeatureExtractors.add(new NamingExtractor1<Token>("Target", new CoveredTextExtractor<Token>())); 150 this.targetFeatureExtractors.add(new NamingExtractor1<Token>("Target", tokenPOSExtractor)); 151 this.targetFeatureExtractors.add(new NamingExtractor1<Token>("Target", tokenStemExtractor)); 152 this.targetFeatureExtractors.add(new NamingExtractor1<Token>("Target", precedingAuxiliaries)); 153 154 this.betweenAnchorsFeatureExtractors = new ArrayList<FeatureExtractor1<Annotation>>(); 155 this.betweenAnchorsFeatureExtractors.add(new NamingExtractor1<Annotation>( 156 "WordsBetween", 
157 new CleartkExtractor<Annotation, Token>(Token.class, new CoveredTextExtractor<Token>(), new Bag(new Covered())))); 158 this.pathExtractor = new TargetPathExtractor(); 159 } 160 161 public void process(JCas jCas) throws AnalysisEngineProcessException { 162 int docEnd = jCas.getDocumentText().length(); 163 164 // collect TLINKs if necessary 165 Map<String, TemporalLink> tlinks = null; 166 if (this.isTraining()) { 167 tlinks = this.getTemporalLinks(jCas); 168 } 169 170 // look for verb-clause pairs in each sentence in the document 171 for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) { 172 TopTreebankNode tree = AnnotationUtil.selectFirstMatching( 173 jCas, 174 TopTreebankNode.class, 175 sentence); 176 if (tree == null) { 177 String fmt = "missing syntactic parse for sentence: %s"; 178 String msg = String.format(fmt, sentence.getCoveredText()); 179 this.getContext().getLogger().log(Level.WARNING, msg); 180 continue; 181 } 182 183 // iterate over all verb-clause pairs 184 List<TreebankNodeLink> links = new ArrayList<TreebankNodeLink>(); 185 this.collectVerbClausePairs(tree, links); 186 for (TreebankNodeLink link : links) { 187 188 Token sourceToken = JCasUtil.selectCovered(jCas, Token.class, link.source).get(0); 189 Token targetToken = JCasUtil.selectCovered(jCas, Token.class, link.target).get(0); 190 int firstEnd = Math.min(sourceToken.getEnd(), targetToken.getEnd()); 191 int lastBegin = Math.max(sourceToken.getBegin(), targetToken.getBegin()); 192 193 // create an instance and populate it with features 194 Instance<String> instance = new Instance<String>(); 195 for (FeatureExtractor1<Token> extractor : this.sourceFeatureExtractors) { 196 instance.addAll(extractor.extract(jCas, sourceToken)); 197 } 198 for (FeatureExtractor1<Token> extractor : this.targetFeatureExtractors) { 199 instance.addAll(extractor.extract(jCas, targetToken)); 200 } 201 Annotation windowAnnotation = new Annotation(jCas, firstEnd, lastBegin); 202 for 
(FeatureExtractor1<Annotation> extractor : this.betweenAnchorsFeatureExtractors) { 203 instance.addAll(extractor.extract(jCas, windowAnnotation)); 204 } 205 instance.addAll(this.pathExtractor.extract(jCas, link.source, link.target)); 206 207 // find source and target anchors if they're available 208 Anchor source = AnnotationUtil.selectFirstMatching(jCas, Anchor.class, link.source); 209 Anchor target = AnnotationUtil.selectFirstMatching(jCas, Anchor.class, link.target); 210 211 // if we're building training data, get the relation type from a 212 // TLINK 213 if (this.isTraining()) { 214 if (source != null && target != null) { 215 String key = String.format("%s:%s", source.getId(), target.getId()); 216 TemporalLink tlink = tlinks.remove(key); 217 if (tlink != null) { 218 instance.setOutcome(tlink.getRelationType()); 219 this.dataWriter.write(instance); 220 } 221 } 222 } 223 224 // if we're classifying create new TLINKs from the 225 // classification outcomes 226 else { 227 source = this.getOrCreateEvent(jCas, source, link.source); 228 target = this.getOrCreateEvent(jCas, target, link.target); 229 // only create TLINKs for events that exist (or were created, if requested) 230 if (source != null && target != null) { 231 String relationType = this.classifier.classify(instance.getFeatures()); 232 TemporalLink tlink = new TemporalLink(jCas, docEnd, docEnd); 233 tlink.setSource(source); 234 tlink.setTarget(target); 235 tlink.setRelationType(relationType); 236 tlink.addToIndexes(); 237 } 238 } 239 } 240 } 241 } 242 243 private Event getOrCreateEvent(JCas jCas, Anchor anchor, TreebankNode node) { 244 if (anchor != null && anchor instanceof Event) { 245 return (Event) anchor; 246 } else if (this.createEvents) { 247 Event event = new Event(jCas, node.getBegin(), node.getEnd()); 248 event.setId("e" + this.eventID); 249 this.eventID++; 250 event.addToIndexes(); 251 return event; 252 } else { 253 return null; 254 } 255 } 256 257 private Map<String, TemporalLink> 
getTemporalLinks(JCas jCas) { 258 Map<String, TemporalLink> tlinks = new HashMap<String, TemporalLink>(); 259 for (TemporalLink tlink : JCasUtil.select(jCas, TemporalLink.class)) { 260 String sourceID = tlink.getSource().getId(); 261 String targetID = tlink.getTarget().getId(); 262 String key = String.format("%s:%s", sourceID, targetID); 263 tlinks.put(key, tlink); 264 } 265 return tlinks; 266 } 267 268 private void collectVerbClausePairs(TreebankNode node, List<TreebankNodeLink> links) { 269 if (this.isVerbPhrase(node)) { 270 List<TreebankNode> sources = new ArrayList<TreebankNode>(); 271 List<TreebankNode> targets = new ArrayList<TreebankNode>(); 272 this.collectHeads(node, sources); 273 274 // look for clauses in descendants 275 for (int i = 0; i < node.getChildren().size(); i++) { 276 TreebankNode child = node.getChildren(i); 277 if (this.isClause(child)) { 278 279 // pair the verb phrase heads with the clause heads 280 targets.clear(); 281 this.collectHeads(child, targets); 282 for (TreebankNode source : sources) { 283 for (TreebankNode target : targets) { 284 285 // skip pairs where the head of the VP is inside the 286 // clause 287 if (!this.contains(child, source)) { 288 links.add(new TreebankNodeLink(source, target)); 289 } 290 } 291 } 292 } 293 } 294 } 295 // look for verb phrases in descendants 296 for (int i = 0; i < node.getChildren().size(); i++) { 297 TreebankNode child = node.getChildren(i); 298 this.collectVerbClausePairs(child, links); 299 } 300 } 301 302 private void collectHeads(TreebankNode node, List<TreebankNode> heads) { 303 if (node.getLeaf()) { 304 heads.add(node); 305 } 306 String[] headTypes = VerbClauseTemporalAnnotator.headMap.get(node.getNodeType()); 307 if (headTypes != null) { 308 for (String headType : headTypes) { 309 boolean foundChildWithHeadType = false; 310 for (int i = 0; i < node.getChildren().size(); i++) { 311 TreebankNode child = node.getChildren(i); 312 if (child.getNodeType().equals(headType)) { 313 String text = 
child.getCoveredText(); 314 if (!VerbClauseTemporalAnnotator.stopWords.contains(text)) { 315 this.collectHeads(child, heads); 316 foundChildWithHeadType = true; 317 } 318 } 319 } 320 if (foundChildWithHeadType) { 321 break; 322 } 323 } 324 } 325 } 326 327 private boolean contains(TreebankNode node, TreebankNode descendant) { 328 if (node == descendant) { 329 return true; 330 } 331 for (int i = 0; i < node.getChildren().size(); i++) { 332 boolean result = this.contains(node.getChildren(i), descendant); 333 if (result) { 334 return true; 335 } 336 } 337 return false; 338 } 339 340 private boolean isVerbPhrase(TreebankNode node) { 341 return node.getNodeType().startsWith("VP"); 342 } 343 344 private boolean isClause(TreebankNode node) { 345 return node.getNodeType().startsWith("S"); 346 } 347 348 private class TreebankNodeLink { 349 public TreebankNode source; 350 351 public TreebankNode target; 352 353 public TreebankNodeLink(TreebankNode source, TreebankNode target) { 354 this.source = source; 355 this.target = target; 356 } 357 } 358 359}