001/*
002 * Copyright (c) 2011, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.timeml.tlink;
025
026import static org.cleartk.syntax.constituent.type.TreebankNodeUtil.selectHighestCoveredTreebankNode;
027import static org.cleartk.syntax.constituent.type.TreebankNodeUtil.selectMatchingLeaf;
028
029import java.util.ArrayList;
030import java.util.List;
031import java.util.regex.Pattern;
032
033import org.apache.uima.UimaContext;
034import org.apache.uima.analysis_engine.AnalysisEngineDescription;
035import org.apache.uima.jcas.JCas;
036import org.apache.uima.resource.ResourceInitializationException;
037import org.cleartk.ml.feature.extractor.CleartkExtractor;
038import org.cleartk.ml.feature.extractor.CoveredTextExtractor;
039import org.cleartk.ml.feature.extractor.FeatureExtractor1;
040import org.cleartk.ml.feature.extractor.FeatureExtractor2;
041import org.cleartk.ml.feature.extractor.TypePathExtractor;
042import org.cleartk.ml.feature.extractor.CleartkExtractor.Bag;
043import org.cleartk.ml.feature.extractor.CleartkExtractor.Covered;
044import org.cleartk.ml.feature.extractor.CleartkExtractor.Following;
045import org.cleartk.ml.feature.extractor.CleartkExtractor.Ngram;
046import org.cleartk.ml.feature.extractor.CleartkExtractor.Preceding;
047import org.cleartk.ml.liblinear.LibLinearStringOutcomeDataWriter;
048import org.cleartk.syntax.constituent.type.TreebankNode;
049import org.cleartk.syntax.constituent.type.TreebankNodeUtil;
050import org.cleartk.timeml.type.Anchor;
051import org.cleartk.timeml.type.Event;
052import org.cleartk.timeml.type.Time;
053import org.cleartk.timeml.util.CleartkInternalModelFactory;
054import org.cleartk.feature.FilteringExtractor;
055import org.cleartk.token.type.Sentence;
056import org.cleartk.token.type.Token;
057import org.apache.uima.fit.factory.AnalysisEngineFactory;
058import org.apache.uima.fit.util.JCasUtil;
059
060import com.google.common.collect.Lists;
061
062/**
063 * <br>
064 * Copyright (c) 2011, Regents of the University of Colorado <br>
065 * All rights reserved.
066 * 
067 * @author Steven Bethard
068 */
069public class TemporalLinkEventToSameSentenceTimeAnnotator extends
070    TemporalLinkAnnotator_ImplBase<Event, Time> {
071
072  public static final CleartkInternalModelFactory FACTORY = new CleartkInternalModelFactory() {
073    @Override
074    public Class<?> getAnnotatorClass() {
075      return TemporalLinkEventToSameSentenceTimeAnnotator.class;
076    }
077
078    @Override
079    public Class<?> getDataWriterClass() {
080      return LibLinearStringOutcomeDataWriter.class;
081    }
082
083    @Override
084    public AnalysisEngineDescription getBaseDescription() throws ResourceInitializationException {
085      return AnalysisEngineFactory.createEngineDescription(TemporalLinkEventToSameSentenceTimeAnnotator.class);
086    }
087  };
088
089  public TemporalLinkEventToSameSentenceTimeAnnotator() {
090    super(Event.class, Time.class, "INCLUDES", "IS_INCLUDED");
091  }
092
093  private static final Pattern SUBORDINATE_PATH_PATTERN = Pattern.compile("^((NP|PP|ADVP)>)*((VP|SBAR|S)>)*(S|SBAR|VP|NP)(<(VP|SBAR|S))*(<(NP|PP|ADVP))*$");
094
095  @Override
096  public void initialize(UimaContext context) throws ResourceInitializationException {
097    super.initialize(context);
098
099    final FeatureExtractor1<Token> prepOrVerbExtractor = new FilteringExtractor<Token>(
100        Token.class,
101        new CoveredTextExtractor<Token>()) {
102      @Override
103      protected boolean accept(Token token) {
104        return token.getPos().equals("TO") || token.getPos().equals("IN")
105            || token.getPos().startsWith("VB");
106      }
107    };
108
109    
110    List<FeatureExtractor1<Event>> srcExtractors = Lists.newArrayList();
111    srcExtractors.add(new TypePathExtractor<Event>(Event.class, "tense"));
112    srcExtractors.add(new TypePathExtractor<Event>(Event.class, "eventClass"));
113    srcExtractors.add(new CleartkExtractor<Event, Token>(Token.class, prepOrVerbExtractor, new Ngram(new Following(5))));
114    this.setSourceExtractors(srcExtractors);
115    
116    List<FeatureExtractor1<Time>> tgtExtractors = Lists.newArrayList();
117    tgtExtractors.add(new CleartkExtractor<Time, Token>(Token.class, new CoveredTextExtractor<Token>(), new Bag(new Covered())));
118    tgtExtractors.add(new TypePathExtractor<Time>(Time.class, "timeType"));
119    tgtExtractors.add(new TypePathExtractor<Time>(Time.class, "value"));
120    tgtExtractors.add(new CleartkExtractor<Time, Token>(Token.class, prepOrVerbExtractor, new Ngram(new Preceding(5))));
121    this.setTargetExtractors(tgtExtractors);
122
123//    this.setTargetExtractors(Arrays.asList(
124//        new CleartkExtractor<Time, Token>(Token.class, new CoveredTextExtractor(), new Bag(new Covered())),
125//        new TypePathExtractor<Time>(Time.class, "timeType"),
126//        new TypePathExtractor<Time>(Time.class, "value"),
127//        new CleartkExtractor<Time, Token>(Token.class, prepOrVerbExtractor, new Ngram(new Preceding(5)))));
128
129    // this will probably only extract when the source (Event) precedes the target (Time)
130    
131    List<FeatureExtractor2<Anchor, Anchor>> btweenExtractors = Lists.newArrayList();
132    btweenExtractors.add(new CleartkExtractor<Anchor, Token>(
133        Token.class,
134        prepOrVerbExtractor,
135        new Bag(new Covered())));
136  }
137
138  @Override
139  protected List<SourceTargetPair> getSourceTargetPairs(JCas jCas) {
140    List<SourceTargetPair> pairs = Lists.newArrayList();
141    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
142      for (Event event : JCasUtil.selectCovered(jCas, Event.class, sentence)) {
143        for (Time time : getSubordinateTimes(event, sentence, jCas)) {
144          pairs.add(new SourceTargetPair(event, time));
145        }
146      }
147    }
148    return pairs;
149  }
150
151  private static List<Time> getSubordinateTimes(Event event, Sentence sentence, JCas jCas) {
152    List<Time> times = new ArrayList<Time>();
153    TreebankNode eventNode = selectMatchingLeaf(jCas, event);
154    for (Time time : JCasUtil.selectCovered(jCas, Time.class, sentence)) {
155      TreebankNode timeNode = selectHighestCoveredTreebankNode(jCas, time);
156      if (eventNode != null && timeNode != null) {
157        String path = noLeavesPath(TreebankNodeUtil.getPath(eventNode, timeNode));
158        if (SUBORDINATE_PATH_PATTERN.matcher(path).matches()) {
159          times.add(time);
160        }
161      }
162    }
163    return times;
164  }
165}