001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.timeml.tlink;
025
026import java.util.ArrayList;
027import java.util.Arrays;
028import java.util.HashMap;
029import java.util.HashSet;
030import java.util.List;
031import java.util.Map;
032import java.util.Set;
033
034import org.apache.uima.analysis_engine.AnalysisEngineDescription;
035import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
036import org.apache.uima.jcas.JCas;
037import org.apache.uima.jcas.tcas.Annotation;
038import org.apache.uima.resource.ResourceInitializationException;
039import org.apache.uima.util.Level;
040import org.cleartk.feature.syntax.TargetPathExtractor;
041import org.cleartk.feature.token.TokenTextForSelectedPosExtractor;
042import org.cleartk.ml.CleartkAnnotator;
043import org.cleartk.ml.Instance;
044import org.cleartk.ml.feature.extractor.CleartkExtractor;
045import org.cleartk.ml.feature.extractor.CoveredTextExtractor;
046import org.cleartk.ml.feature.extractor.FeatureExtractor1;
047import org.cleartk.ml.feature.extractor.NamingExtractor1;
048import org.cleartk.ml.feature.extractor.TypePathExtractor;
049import org.cleartk.ml.feature.extractor.CleartkExtractor.Bag;
050import org.cleartk.ml.feature.extractor.CleartkExtractor.Covered;
051import org.cleartk.ml.feature.extractor.CleartkExtractor.Preceding;
052import org.cleartk.ml.liblinear.LibLinearStringOutcomeDataWriter;
053import org.cleartk.syntax.constituent.type.TopTreebankNode;
054import org.cleartk.syntax.constituent.type.TreebankNode;
055import org.cleartk.timeml.type.Anchor;
056import org.cleartk.timeml.type.Event;
057import org.cleartk.timeml.type.TemporalLink;
058import org.cleartk.timeml.util.CleartkInternalModelFactory;
059import org.cleartk.token.type.Sentence;
060import org.cleartk.token.type.Token;
061import org.cleartk.util.AnnotationUtil;
062import org.apache.uima.fit.descriptor.ConfigurationParameter;
063import org.apache.uima.fit.descriptor.TypeCapability;
064import org.apache.uima.fit.factory.AnalysisEngineFactory;
065import org.apache.uima.fit.util.JCasUtil;
066
067import com.google.common.collect.Lists;
068
069/**
070 * <br>
071 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
072 * All rights reserved.
073 * 
074 * 
075 * 
076 * @author Steven Bethard
077 */
078@TypeCapability(
079    outputs = { "org.cleartk.timeml.type.TemporalLink", "org.cleartk.timeml.type.Event" })
080public class VerbClauseTemporalAnnotator extends CleartkAnnotator<String> {
081
082  public static final CleartkInternalModelFactory FACTORY = new CleartkInternalModelFactory() {
083    @Override
084    public Class<?> getAnnotatorClass() {
085      return VerbClauseTemporalAnnotator.class;
086    }
087
088    @Override
089    public Class<?> getDataWriterClass() {
090      return LibLinearStringOutcomeDataWriter.class;
091    }
092
093    @Override
094    public AnalysisEngineDescription getBaseDescription() throws ResourceInitializationException {
095      return AnalysisEngineFactory.createEngineDescription(VerbClauseTemporalAnnotator.class);
096    }
097  };
098
099  private static final Map<String, String[]> headMap = new HashMap<String, String[]>();
100  static {
101    headMap.put("S", "VP S SBAR ADJP".split(" "));
102    headMap.put("SBAR", "VP S SBAR ADJP".split(" "));
103    headMap.put("VP", ("VP VB VBZ VBP VBG VBN VBD JJ JJR JJS "
104        + "NNS NN PRP NNPS NNP ADJP NP S SBAR").split(" "));
105    headMap.put("ADJP", "ADJP VB VBZ VBP VBG VBN VBD JJ JJR JJS".split(" "));
106    headMap.put("NP", "NP NNS NN PRP NNPS NNP QP ADJP".split(" "));
107    headMap.put("QP", "NP NNS NN PRP NNPS NNP QP ADJP".split(" "));
108  }
109
110  private static final Set<String> stopWords = new HashSet<String>(
111      Arrays.asList("be been is 's am are was were has had have".split(" ")));
112
113  private List<FeatureExtractor1<Token>> sourceFeatureExtractors;
114
115  private List<FeatureExtractor1<Token>> targetFeatureExtractors;
116
117  private List<FeatureExtractor1<Annotation>> betweenAnchorsFeatureExtractors;
118
119  private TargetPathExtractor pathExtractor;
120
121  private int eventID;
122
123  @ConfigurationParameter(
124      name = PARAM_CREATE_EVENTS,
125      defaultValue = "false", description = "Create events for all verbs in "
126      + "verb-clause relations (using existing events if present, but adding new ones "
127      + "wherever they are not present).")
128  private boolean createEvents;
129
130  public static final String PARAM_CREATE_EVENTS = "createEvents";
131  
132  public VerbClauseTemporalAnnotator() {
133    this.eventID = 1;
134
135    FeatureExtractor1<Token> precedingAuxiliaries = new CleartkExtractor<Token, Token>(
136        Token.class,
137        new TokenTextForSelectedPosExtractor("MD", "TO", "IN", "VB", "RB"),
138        new Preceding(3));
139    FeatureExtractor1<Token> tokenStemExtractor = new TypePathExtractor<Token>(Token.class, "stem");
140    FeatureExtractor1<Token> tokenPOSExtractor = new TypePathExtractor<Token>(Token.class, "pos");
141
142    this.sourceFeatureExtractors = Lists.newArrayList();
143    this.sourceFeatureExtractors.add(new NamingExtractor1<Token>("Source", new CoveredTextExtractor<Token>()));
144    this.sourceFeatureExtractors.add(new NamingExtractor1<Token>("Source", tokenPOSExtractor));
145    this.sourceFeatureExtractors.add(new NamingExtractor1<Token>("Source", tokenStemExtractor));
146    this.sourceFeatureExtractors.add(new NamingExtractor1<Token>("Source", precedingAuxiliaries));
147
148    this.targetFeatureExtractors = Lists.newArrayList();
149    this.targetFeatureExtractors.add(new NamingExtractor1<Token>("Target", new CoveredTextExtractor<Token>()));
150    this.targetFeatureExtractors.add(new NamingExtractor1<Token>("Target", tokenPOSExtractor));
151    this.targetFeatureExtractors.add(new NamingExtractor1<Token>("Target", tokenStemExtractor));
152    this.targetFeatureExtractors.add(new NamingExtractor1<Token>("Target", precedingAuxiliaries));
153
154    this.betweenAnchorsFeatureExtractors = new ArrayList<FeatureExtractor1<Annotation>>();
155    this.betweenAnchorsFeatureExtractors.add(new NamingExtractor1<Annotation>(
156        "WordsBetween",
157        new CleartkExtractor<Annotation, Token>(Token.class, new CoveredTextExtractor<Token>(), new Bag(new Covered()))));
158    this.pathExtractor = new TargetPathExtractor();
159  }
160
161  public void process(JCas jCas) throws AnalysisEngineProcessException {
162    int docEnd = jCas.getDocumentText().length();
163
164    // collect TLINKs if necessary
165    Map<String, TemporalLink> tlinks = null;
166    if (this.isTraining()) {
167      tlinks = this.getTemporalLinks(jCas);
168    }
169
170    // look for verb-clause pairs in each sentence in the document
171    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
172      TopTreebankNode tree = AnnotationUtil.selectFirstMatching(
173          jCas,
174          TopTreebankNode.class,
175          sentence);
176      if (tree == null) {
177        String fmt = "missing syntactic parse for sentence: %s";
178        String msg = String.format(fmt, sentence.getCoveredText());
179        this.getContext().getLogger().log(Level.WARNING, msg);
180        continue;
181      }
182
183      // iterate over all verb-clause pairs
184      List<TreebankNodeLink> links = new ArrayList<TreebankNodeLink>();
185      this.collectVerbClausePairs(tree, links);
186      for (TreebankNodeLink link : links) {
187
188        Token sourceToken = JCasUtil.selectCovered(jCas, Token.class, link.source).get(0);
189        Token targetToken = JCasUtil.selectCovered(jCas, Token.class, link.target).get(0);
190        int firstEnd = Math.min(sourceToken.getEnd(), targetToken.getEnd());
191        int lastBegin = Math.max(sourceToken.getBegin(), targetToken.getBegin());
192
193        // create an instance and populate it with features
194        Instance<String> instance = new Instance<String>();
195        for (FeatureExtractor1<Token> extractor : this.sourceFeatureExtractors) {
196          instance.addAll(extractor.extract(jCas, sourceToken));
197        }
198        for (FeatureExtractor1<Token> extractor : this.targetFeatureExtractors) {
199          instance.addAll(extractor.extract(jCas, targetToken));
200        }
201        Annotation windowAnnotation = new Annotation(jCas, firstEnd, lastBegin);
202        for (FeatureExtractor1<Annotation> extractor : this.betweenAnchorsFeatureExtractors) {
203          instance.addAll(extractor.extract(jCas, windowAnnotation));
204        }
205        instance.addAll(this.pathExtractor.extract(jCas, link.source, link.target));
206
207        // find source and target anchors if they're available
208        Anchor source = AnnotationUtil.selectFirstMatching(jCas, Anchor.class, link.source);
209        Anchor target = AnnotationUtil.selectFirstMatching(jCas, Anchor.class, link.target);
210
211        // if we're building training data, get the relation type from a
212        // TLINK
213        if (this.isTraining()) {
214          if (source != null && target != null) {
215            String key = String.format("%s:%s", source.getId(), target.getId());
216            TemporalLink tlink = tlinks.remove(key);
217            if (tlink != null) {
218              instance.setOutcome(tlink.getRelationType());
219              this.dataWriter.write(instance);
220            }
221          }
222        }
223
224        // if we're classifying create new TLINKs from the
225        // classification outcomes
226        else {
227          source = this.getOrCreateEvent(jCas, source, link.source);
228          target = this.getOrCreateEvent(jCas, target, link.target);
229          // only create TLINKs for events that exist (or were created, if requested)
230          if (source != null && target != null) {
231            String relationType = this.classifier.classify(instance.getFeatures());
232            TemporalLink tlink = new TemporalLink(jCas, docEnd, docEnd);
233            tlink.setSource(source);
234            tlink.setTarget(target);
235            tlink.setRelationType(relationType);
236            tlink.addToIndexes();
237          }
238        }
239      }
240    }
241  }
242
243  private Event getOrCreateEvent(JCas jCas, Anchor anchor, TreebankNode node) {
244    if (anchor != null && anchor instanceof Event) {
245      return (Event) anchor;
246    } else if (this.createEvents) {
247      Event event = new Event(jCas, node.getBegin(), node.getEnd());
248      event.setId("e" + this.eventID);
249      this.eventID++;
250      event.addToIndexes();
251      return event;
252    } else {
253      return null;
254    }
255  }
256
257  private Map<String, TemporalLink> getTemporalLinks(JCas jCas) {
258    Map<String, TemporalLink> tlinks = new HashMap<String, TemporalLink>();
259    for (TemporalLink tlink : JCasUtil.select(jCas, TemporalLink.class)) {
260      String sourceID = tlink.getSource().getId();
261      String targetID = tlink.getTarget().getId();
262      String key = String.format("%s:%s", sourceID, targetID);
263      tlinks.put(key, tlink);
264    }
265    return tlinks;
266  }
267
268  private void collectVerbClausePairs(TreebankNode node, List<TreebankNodeLink> links) {
269    if (this.isVerbPhrase(node)) {
270      List<TreebankNode> sources = new ArrayList<TreebankNode>();
271      List<TreebankNode> targets = new ArrayList<TreebankNode>();
272      this.collectHeads(node, sources);
273
274      // look for clauses in descendants
275      for (int i = 0; i < node.getChildren().size(); i++) {
276        TreebankNode child = node.getChildren(i);
277        if (this.isClause(child)) {
278
279          // pair the verb phrase heads with the clause heads
280          targets.clear();
281          this.collectHeads(child, targets);
282          for (TreebankNode source : sources) {
283            for (TreebankNode target : targets) {
284
285              // skip pairs where the head of the VP is inside the
286              // clause
287              if (!this.contains(child, source)) {
288                links.add(new TreebankNodeLink(source, target));
289              }
290            }
291          }
292        }
293      }
294    }
295    // look for verb phrases in descendants
296    for (int i = 0; i < node.getChildren().size(); i++) {
297      TreebankNode child = node.getChildren(i);
298      this.collectVerbClausePairs(child, links);
299    }
300  }
301
302  private void collectHeads(TreebankNode node, List<TreebankNode> heads) {
303    if (node.getLeaf()) {
304      heads.add(node);
305    }
306    String[] headTypes = VerbClauseTemporalAnnotator.headMap.get(node.getNodeType());
307    if (headTypes != null) {
308      for (String headType : headTypes) {
309        boolean foundChildWithHeadType = false;
310        for (int i = 0; i < node.getChildren().size(); i++) {
311          TreebankNode child = node.getChildren(i);
312          if (child.getNodeType().equals(headType)) {
313            String text = child.getCoveredText();
314            if (!VerbClauseTemporalAnnotator.stopWords.contains(text)) {
315              this.collectHeads(child, heads);
316              foundChildWithHeadType = true;
317            }
318          }
319        }
320        if (foundChildWithHeadType) {
321          break;
322        }
323      }
324    }
325  }
326
327  private boolean contains(TreebankNode node, TreebankNode descendant) {
328    if (node == descendant) {
329      return true;
330    }
331    for (int i = 0; i < node.getChildren().size(); i++) {
332      boolean result = this.contains(node.getChildren(i), descendant);
333      if (result) {
334        return true;
335      }
336    }
337    return false;
338  }
339
340  private boolean isVerbPhrase(TreebankNode node) {
341    return node.getNodeType().startsWith("VP");
342  }
343
344  private boolean isClause(TreebankNode node) {
345    return node.getNodeType().startsWith("S");
346  }
347
348  private class TreebankNodeLink {
349    public TreebankNode source;
350
351    public TreebankNode target;
352
353    public TreebankNodeLink(TreebankNode source, TreebankNode target) {
354      this.source = source;
355      this.target = target;
356    }
357  }
358
359}