001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.corpus.timeml;
025
026import java.io.IOException;
027import java.io.StringReader;
028import java.util.ArrayList;
029import java.util.Arrays;
030import java.util.HashMap;
031import java.util.HashSet;
032import java.util.List;
033import java.util.Map;
034import java.util.Set;
035
036import org.apache.uima.UimaContext;
037import org.apache.uima.analysis_engine.AnalysisEngineDescription;
038import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
039import org.apache.uima.cas.CAS;
040import org.apache.uima.cas.CASException;
041import org.apache.uima.jcas.JCas;
042import org.apache.uima.resource.ResourceInitializationException;
043import org.apache.uima.util.Level;
044import org.cleartk.timeml.type.Anchor;
045import org.cleartk.timeml.type.DocumentCreationTime;
046import org.cleartk.timeml.type.Event;
047import org.cleartk.timeml.type.TemporalLink;
048import org.cleartk.timeml.type.Text;
049import org.cleartk.timeml.type.Time;
050import org.cleartk.token.type.Sentence;
051import org.cleartk.util.ViewUriUtil;
052import org.jdom2.Content;
053import org.jdom2.Document;
054import org.jdom2.Element;
055import org.jdom2.JDOMException;
056import org.jdom2.input.SAXBuilder;
057import org.jdom2.output.XMLOutputter;
058import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
059import org.apache.uima.fit.descriptor.ConfigurationParameter;
060import org.apache.uima.fit.descriptor.SofaCapability;
061import org.apache.uima.fit.factory.AnalysisEngineFactory;
062
063/**
064 * <br>
065 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
066 * All rights reserved.
067 * 
068 * 
069 * @author Steven Bethard
070 * 
071 */
072@SofaCapability(inputSofas = { TimeMlGoldAnnotator.TIMEML_VIEW_NAME, CAS.NAME_DEFAULT_SOFA })
073public class TimeMlGoldAnnotator extends JCasAnnotator_ImplBase {
074
075  public static final String TIMEML_VIEW_NAME = "TimeMLView";
076
077  public static final String PARAM_LOAD_TLINKS = "loadTlinks";
078
079  @ConfigurationParameter(
080      name = PARAM_LOAD_TLINKS,
081      mandatory = false,
082      description = "when false indicates that annotation should not be created for TLINKs (though annotations will still be created for TIMEX3s, EVENTs, etc.).",
083      defaultValue = "true")
084  private boolean loadTlinks;
085
086  public static AnalysisEngineDescription getDescription() throws ResourceInitializationException {
087    return AnalysisEngineFactory.createEngineDescription(TimeMlGoldAnnotator.class);
088  }
089
090  public static AnalysisEngineDescription getDescriptionNoTLINKs()
091      throws ResourceInitializationException {
092    return AnalysisEngineFactory.createEngineDescription(
093        TimeMlGoldAnnotator.class,
094        PARAM_LOAD_TLINKS,
095        false);
096  }
097
098  @Override
099  public void initialize(UimaContext context) throws ResourceInitializationException {
100    super.initialize(context);
101  }
102
103  @Override
104  public void process(JCas jCas) throws AnalysisEngineProcessException {
105    JCas timemlView;
106    JCas initialView;
107    try {
108      timemlView = jCas.getView(TIMEML_VIEW_NAME);
109      initialView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
110    } catch (CASException e) {
111      throw new AnalysisEngineProcessException(e);
112    }
113
114    String timeML = timemlView.getDocumentText();
115    SAXBuilder builder = new SAXBuilder();
116    builder.setDTDHandler(null);
117    Element root;
118    try {
119      Document doc = builder.build(new StringReader(timeML));
120      root = doc.getRootElement();
121    } catch (JDOMException e) {
122      getContext().getLogger().log(
123          Level.SEVERE,
124          "problem parsing document: " + ViewUriUtil.getURI(jCas));
125      throw new AnalysisEngineProcessException(e);
126    } catch (IOException e) {
127      throw new AnalysisEngineProcessException(e);
128    }
129
130    // collect the document text, add the Event, Time and TemporalLink annotations,
131    // and collect the necessary information to fill in the cross-annotation links
132    StringBuffer textBuffer = new StringBuffer();
133    Map<String, Anchor> anchors = new HashMap<String, Anchor>();
134    Map<Time, String> anchorTimeIDs = new HashMap<Time, String>();
135    List<Element> makeInstances = new ArrayList<Element>();
136    Map<TemporalLink, String> tlinkSourceIDs = new HashMap<TemporalLink, String>();
137    Map<TemporalLink, String> tlinkTargetIDs = new HashMap<TemporalLink, String>();
138    this.addAnnotations(
139        initialView,
140        root,
141        textBuffer,
142        anchors,
143        anchorTimeIDs,
144        makeInstances,
145        tlinkSourceIDs,
146        tlinkTargetIDs);
147    initialView.setDocumentText(textBuffer.toString());
148
149    // point make-instance IDs to their events, and copy attributes over
150    Set<Event> processedEvents = new HashSet<Event>();
151    for (Element makeInstance : makeInstances) {
152      String eventID = makeInstance.getAttributeValue("eventID");
153      String eventInstanceID = makeInstance.getAttributeValue("eiid");
154      Event event = (Event) this.getAnchor(jCas, anchors, eventID);
155      anchors.put(eventInstanceID, event);
156      if (!processedEvents.contains(event)) {
157        TimeMlUtil.copyAttributes(makeInstance, event, jCas);
158        processedEvents.add(event);
159      } else {
160        String makeInstanceXML = new XMLOutputter().outputString(makeInstance);
161        String message = "Ignoring attributes from additional %s in %s";
162        String fileName = ViewUriUtil.getURI(jCas).toString();
163        this.getLogger().warn(String.format(message, makeInstanceXML, fileName));
164      }
165    }
166
167    // set anchor times
168    for (Time time : anchorTimeIDs.keySet()) {
169      Time anchorTime = (Time) this.getAnchor(jCas, anchors, anchorTimeIDs.get(time));
170      time.setAnchorTime(anchorTime);
171    }
172
173    // set tlink sources and targets
174    for (TemporalLink tlink : tlinkSourceIDs.keySet()) {
175      tlink.setSource(this.getAnchor(jCas, anchors, tlinkSourceIDs.get(tlink)));
176      tlink.setTarget(this.getAnchor(jCas, anchors, tlinkTargetIDs.get(tlink)));
177    }
178  }
179
180  private void addAnnotations(
181      JCas jCas,
182      Element element,
183      StringBuffer textBuffer,
184      Map<String, Anchor> anchors,
185      Map<Time, String> anchorTimeIDs,
186      List<Element> makeInstances,
187      Map<TemporalLink, String> tlinkSourceIDs,
188      Map<TemporalLink, String> tlinkTargetIDs) throws AnalysisEngineProcessException {
189    int startOffset = textBuffer.length();
190    for (Content content : element.getContent()) {
191      if (content instanceof org.jdom2.Text) {
192        textBuffer.append(((org.jdom2.Text) content).getText());
193      } else if (content instanceof Element) {
194        this.addAnnotations(
195            jCas,
196            (Element) content,
197            textBuffer,
198            anchors,
199            anchorTimeIDs,
200            makeInstances,
201            tlinkSourceIDs,
202            tlinkTargetIDs);
203      }
204    }
205    int endOffset = textBuffer.length();
206
207    String elementName = element.getName().toUpperCase();
208    if (elementName.equals("TIMEX3")) {
209      String funcInDoc = element.getAttributeValue("functionInDocument");
210      boolean isCreationTime = funcInDoc != null && funcInDoc.equals("CREATION_TIME");
211      Time time = isCreationTime
212          ? new DocumentCreationTime(jCas, startOffset, endOffset)
213          : new Time(jCas, startOffset, endOffset);
214      TimeMlUtil.copyAttributes(element, time, jCas);
215      String anchorTimeID = element.getAttributeValue("anchorTimeID");
216      if (anchorTimeID != null) {
217        anchorTimeIDs.put(time, anchorTimeID);
218      }
219      anchors.put(time.getId(), time);
220      time.addToIndexes();
221    } else if (elementName.equals("EVENT")) {
222      Event event = new Event(jCas, startOffset, endOffset);
223      TimeMlUtil.copyAttributes(element, event, jCas);
224      anchors.put(event.getId(), event);
225      event.addToIndexes();
226    } else if (elementName.equals("MAKEINSTANCE")) {
227      makeInstances.add(element);
228    } else if (elementName.equals("TLINK") && this.loadTlinks) {
229      TemporalLink temporalLink = new TemporalLink(jCas, startOffset, endOffset);
230      TimeMlUtil.copyAttributes(element, temporalLink, jCas);
231      String sourceID = this.getOneOf(element, "eventInstanceID", "eventID", "timeID");
232      String targetID = this.getOneOf(
233          element,
234          "relatedToEventInstance",
235          "relatedToEvent",
236          "relatedToTime");
237      tlinkSourceIDs.put(temporalLink, sourceID);
238      tlinkTargetIDs.put(temporalLink, targetID);
239      temporalLink.addToIndexes();
240    } else if (elementName.equals("TEXT")) {
241      Text text = new Text(jCas, startOffset, endOffset);
242      text.addToIndexes();
243    } else if (elementName.toLowerCase().equals("s")) {
244      Sentence sentence = new Sentence(jCas, startOffset, endOffset);
245      sentence.addToIndexes();
246    }
247  }
248
249  private String getOneOf(Element element, String... attributeNames) {
250    for (String name : attributeNames) {
251      String result = element.getAttributeValue(name);
252      if (result != null) {
253        return result;
254      }
255    }
256    throw new RuntimeException(String.format(
257        "unable to find in %s any of the following attributes: %s",
258        element,
259        Arrays.asList(attributeNames)));
260  }
261
262  private Anchor getAnchor(JCas jCas, Map<String, Anchor> anchors, String id)
263      throws AnalysisEngineProcessException {
264    Anchor anchor = anchors.get(id);
265    if (anchor == null) {
266      throw new RuntimeException(String.format(
267          "%s: no anchor for id %s",
268          ViewUriUtil.getURI(jCas),
269          id));
270    }
271    return anchor;
272  }
273
274  public void setLoadTlinks(boolean loadTLINKs) {
275    this.loadTlinks = loadTLINKs;
276  }
277}