001/* 002 * Copyright (c) 2011, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.timeml.tlink; 025 026import static org.cleartk.syntax.constituent.type.TreebankNodeUtil.selectHighestCoveredTreebankNode; 027import static org.cleartk.syntax.constituent.type.TreebankNodeUtil.selectMatchingLeaf; 028 029import java.util.ArrayList; 030import java.util.List; 031import java.util.regex.Pattern; 032 033import org.apache.uima.UimaContext; 034import org.apache.uima.analysis_engine.AnalysisEngineDescription; 035import org.apache.uima.jcas.JCas; 036import org.apache.uima.resource.ResourceInitializationException; 037import org.cleartk.ml.feature.extractor.CleartkExtractor; 038import org.cleartk.ml.feature.extractor.CoveredTextExtractor; 039import org.cleartk.ml.feature.extractor.FeatureExtractor1; 040import org.cleartk.ml.feature.extractor.FeatureExtractor2; 041import org.cleartk.ml.feature.extractor.TypePathExtractor; 042import org.cleartk.ml.feature.extractor.CleartkExtractor.Bag; 043import org.cleartk.ml.feature.extractor.CleartkExtractor.Covered; 044import org.cleartk.ml.feature.extractor.CleartkExtractor.Following; 045import org.cleartk.ml.feature.extractor.CleartkExtractor.Ngram; 046import org.cleartk.ml.feature.extractor.CleartkExtractor.Preceding; 047import org.cleartk.ml.liblinear.LibLinearStringOutcomeDataWriter; 048import org.cleartk.syntax.constituent.type.TreebankNode; 049import org.cleartk.syntax.constituent.type.TreebankNodeUtil; 050import org.cleartk.timeml.type.Anchor; 051import org.cleartk.timeml.type.Event; 052import org.cleartk.timeml.type.Time; 053import org.cleartk.timeml.util.CleartkInternalModelFactory; 054import org.cleartk.feature.FilteringExtractor; 055import org.cleartk.token.type.Sentence; 056import org.cleartk.token.type.Token; 057import org.apache.uima.fit.factory.AnalysisEngineFactory; 058import org.apache.uima.fit.util.JCasUtil; 059 060import com.google.common.collect.Lists; 061 062/** 063 * <br> 064 * Copyright (c) 2011, Regents of the University of Colorado <br> 065 * All rights reserved. 066 * 067 * @author Steven Bethard 068 */ 069public class TemporalLinkEventToSameSentenceTimeAnnotator extends 070 TemporalLinkAnnotator_ImplBase<Event, Time> { 071 072 public static final CleartkInternalModelFactory FACTORY = new CleartkInternalModelFactory() { 073 @Override 074 public Class<?> getAnnotatorClass() { 075 return TemporalLinkEventToSameSentenceTimeAnnotator.class; 076 } 077 078 @Override 079 public Class<?> getDataWriterClass() { 080 return LibLinearStringOutcomeDataWriter.class; 081 } 082 083 @Override 084 public AnalysisEngineDescription getBaseDescription() throws ResourceInitializationException { 085 return AnalysisEngineFactory.createEngineDescription(TemporalLinkEventToSameSentenceTimeAnnotator.class); 086 } 087 }; 088 089 public TemporalLinkEventToSameSentenceTimeAnnotator() { 090 super(Event.class, Time.class, "INCLUDES", "IS_INCLUDED"); 091 } 092 093 private static final Pattern SUBORDINATE_PATH_PATTERN = Pattern.compile("^((NP|PP|ADVP)>)*((VP|SBAR|S)>)*(S|SBAR|VP|NP)(<(VP|SBAR|S))*(<(NP|PP|ADVP))*$"); 094 095 @Override 096 public void initialize(UimaContext context) throws ResourceInitializationException { 097 super.initialize(context); 098 099 final FeatureExtractor1<Token> prepOrVerbExtractor = new FilteringExtractor<Token>( 100 Token.class, 101 new CoveredTextExtractor<Token>()) { 102 @Override 103 protected boolean accept(Token token) { 104 return token.getPos().equals("TO") || token.getPos().equals("IN") 105 || token.getPos().startsWith("VB"); 106 } 107 }; 108 109 110 List<FeatureExtractor1<Event>> srcExtractors = Lists.newArrayList(); 111 srcExtractors.add(new TypePathExtractor<Event>(Event.class, "tense")); 112 srcExtractors.add(new TypePathExtractor<Event>(Event.class, "eventClass")); 113 srcExtractors.add(new CleartkExtractor<Event, Token>(Token.class, prepOrVerbExtractor, new Ngram(new Following(5)))); 114 this.setSourceExtractors(srcExtractors); 115 116 List<FeatureExtractor1<Time>> tgtExtractors = Lists.newArrayList(); 117 tgtExtractors.add(new CleartkExtractor<Time, Token>(Token.class, new CoveredTextExtractor<Token>(), new Bag(new Covered()))); 118 tgtExtractors.add(new TypePathExtractor<Time>(Time.class, "timeType")); 119 tgtExtractors.add(new TypePathExtractor<Time>(Time.class, "value")); 120 tgtExtractors.add(new CleartkExtractor<Time, Token>(Token.class, prepOrVerbExtractor, new Ngram(new Preceding(5)))); 121 this.setTargetExtractors(tgtExtractors); 122 123// this.setTargetExtractors(Arrays.asList( 124// new CleartkExtractor<Time, Token>(Token.class, new CoveredTextExtractor(), new Bag(new Covered())), 125// new TypePathExtractor<Time>(Time.class, "timeType"), 126// new TypePathExtractor<Time>(Time.class, "value"), 127// new CleartkExtractor<Time, Token>(Token.class, prepOrVerbExtractor, new Ngram(new Preceding(5))))); 128 129 // this will probably only extract when the source (Event) precedes the target (Time) 130 131 List<FeatureExtractor2<Anchor, Anchor>> btweenExtractors = Lists.newArrayList(); 132 btweenExtractors.add(new CleartkExtractor<Anchor, Token>( 133 Token.class, 134 prepOrVerbExtractor, 135 new Bag(new Covered()))); 136 } 137 138 @Override 139 protected List<SourceTargetPair> getSourceTargetPairs(JCas jCas) { 140 List<SourceTargetPair> pairs = Lists.newArrayList(); 141 for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) { 142 for (Event event : JCasUtil.selectCovered(jCas, Event.class, sentence)) { 143 for (Time time : getSubordinateTimes(event, sentence, jCas)) { 144 pairs.add(new SourceTargetPair(event, time)); 145 } 146 } 147 } 148 return pairs; 149 } 150 151 private static List<Time> getSubordinateTimes(Event event, Sentence sentence, JCas jCas) { 152 List<Time> times = new ArrayList<Time>(); 153 TreebankNode eventNode = selectMatchingLeaf(jCas, event); 154 for (Time time : JCasUtil.selectCovered(jCas, Time.class, sentence)) { 155 TreebankNode timeNode = selectHighestCoveredTreebankNode(jCas, time); 156 if (eventNode != null && timeNode != null) { 157 String path = noLeavesPath(TreebankNodeUtil.getPath(eventNode, timeNode)); 158 if (SUBORDINATE_PATH_PATTERN.matcher(path).matches()) { 159 times.add(time); 160 } 161 } 162 } 163 return times; 164 } 165}