001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.corpus.conll2005;
025
026import java.io.BufferedReader;
027import java.io.IOException;
028import java.io.StringReader;
029import java.util.ArrayList;
030import java.util.List;
031import java.util.NoSuchElementException;
032import java.util.Stack;
033
034import org.apache.uima.UimaContext;
035import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
036import org.apache.uima.cas.CAS;
037import org.apache.uima.cas.CASException;
038import org.apache.uima.jcas.JCas;
039import org.apache.uima.jcas.cas.FSArray;
040import org.apache.uima.jcas.tcas.Annotation;
041import org.apache.uima.resource.ResourceInitializationException;
042import org.cleartk.ne.type.NamedEntityMention;
043import org.cleartk.srl.type.Chunk;
044import org.cleartk.srl.type.Predicate;
045import org.cleartk.srl.type.SemanticArgument;
046import org.cleartk.syntax.constituent.type.TopTreebankNode;
047import org.cleartk.syntax.constituent.type.TreebankNode;
048import org.cleartk.syntax.constituent.type.TreebankNodeUtil;
049import org.cleartk.token.type.Sentence;
050import org.cleartk.token.type.Token;
051import org.cleartk.util.AnnotationUtil;
052import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
053import org.apache.uima.fit.descriptor.ConfigurationParameter;
054import org.apache.uima.fit.descriptor.SofaCapability;
055import org.apache.uima.fit.util.FSCollectionFactory;
056
057/**
058 * <br>
059 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
060 * All rights reserved.
061 */
062@SofaCapability(
063    inputSofas = { Conll2005Constants.CONLL_2005_VIEW, CAS.NAME_DEFAULT_SOFA },
064    outputSofas = {})
065public class Conll2005GoldAnnotator extends JCasAnnotator_ImplBase {
066
067  @ConfigurationParameter(
068      name = PARAM_HAS_VERB_SENSES,
069      mandatory = true,
070      description = "does the data file contain verb sense tags")
071  private Boolean hasVerbSenses;
072
073  public static final String PARAM_HAS_VERB_SENSES = "hasVerbSenses";
074
075  @Override
076  public void initialize(UimaContext context) throws ResourceInitializationException {
077    super.initialize(context);
078  }
079
080  @Override
081  public void process(JCas jCas) throws AnalysisEngineProcessException {
082    try {
083      JCas conllView = jCas.getView(Conll2005Constants.CONLL_2005_VIEW);
084      JCas initView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
085
086      String conllText = conllView.getSofaDataString();
087
088      List<Conll2005Line> conll2005Lines = new ArrayList<Conll2005Line>();
089      for (String line : conllText.split("\n")) {
090        conll2005Lines.add(new Conll2005Line(line, hasVerbSenses));
091        // System.err.println(line);
092      }
093      // System.err.println();
094
095      StringBuffer docText = new StringBuffer();
096
097      List<TreebankNode> terminals = new ArrayList<TreebankNode>(conll2005Lines.size());
098      CharniakParseParser parser = new CharniakParseParser(initView);
099
100      int numberOfPredicates = 0;
101      for (Conll2005Line line : conll2005Lines)
102        if (!line.targetVerb.equals("-"))
103          numberOfPredicates += 1;
104
105      int currentPredicate = 0;
106      PredicateParser predicateParsers[] = new PredicateParser[numberOfPredicates];
107      for (int i = 0; i < numberOfPredicates; i++)
108        predicateParsers[i] = new PredicateParser(initView);
109
110      NamedEntityParser namedEntityParser = new NamedEntityParser(initView);
111
112      for (Conll2005Line line : conll2005Lines.toArray(new Conll2005Line[0])) {
113        if (line.argumentSegments.length != 0 && line.argumentSegments.length != numberOfPredicates) {
114          throw new RuntimeException(String.format(
115              "expected 0 or %d segments, found %d",
116              numberOfPredicates,
117              line.argumentSegments.length));
118        }
119
120        if (docText.length() > 0 && line.word.length() > 0) {
121          docText.append(" ");
122        }
123        int startIndex = docText.length();
124        docText.append(line.word);
125        int endIndex = docText.length();
126
127        Token token = new Token(initView, startIndex, endIndex);
128        token.setPos(line.pos);
129        token.addToIndexes();
130
131        TreebankNode terminal = new TreebankNode(initView, startIndex, endIndex);
132        terminal.setNodeType(line.pos);
133        terminal.setNodeValue(line.word);
134        terminal.setChildren(new FSArray(jCas, 0));
135        terminal.setLeaf(true);
136        terminal.addToIndexes();
137        terminals.add(terminal);
138
139        parser.feed(line.charniakParseSegment, terminal);
140
141        namedEntityParser.feed(line.neSegment, token);
142
143        if (line.argumentSegments.length > 0) {
144          for (int i = 0; i < numberOfPredicates; i++) {
145            predicateParsers[i].feed(line.argumentSegments[i], token);
146          }
147        }
148
149        if (!line.targetVerb.equals("-")) {
150          predicateParsers[currentPredicate].feedInfo(
151              line.word,
152              line.targetVerb,
153              line.verbSenseTag,
154              token);
155          currentPredicate += 1;
156        }
157      }
158      initView.setSofaDataString(docText.toString(), "text/plain");
159
160      Sentence sentence = new Sentence(initView, 0, docText.toString().length());
161      sentence.addToIndexes();
162
163      parser.makeParse();
164
165      for (PredicateParser predicateParser : predicateParsers)
166        predicateParser.makePredicate();
167
168    } catch (CASException e) {
169      throw new AnalysisEngineProcessException(e);
170    } catch (IOException e) {
171      throw new AnalysisEngineProcessException(e);
172    }
173  }
174
175  private static class Conll2005Line {
176    String word;
177
178    String pos;
179
180    String charniakParseSegment;
181
182    String neSegment;
183
184    String verbSenseTag;
185
186    String targetVerb;
187
188    String argumentSegments[];
189
190    Conll2005Line(String line, boolean hasSenseTag) {
191      String fields[] = line.split("\\s+");
192      int i = 0;
193      this.word = fields[i++].trim();
194      this.pos = fields[i++].trim();
195      this.charniakParseSegment = fields[i++].trim();
196      this.neSegment = fields[i++].trim();
197
198      if (hasSenseTag) {
199        this.verbSenseTag = fields[i++].trim();
200      } else {
201        this.verbSenseTag = null;
202      }
203
204      this.targetVerb = fields[i++].trim();
205
206      this.argumentSegments = new String[fields.length - i];
207      for (int j = 0; j < argumentSegments.length; j++) {
208        this.argumentSegments[j] = fields[i++].trim();
209      }
210    }
211  }
212
213  private static class Constituent {
214    String type;
215
216    List<TreebankNode> children;
217
218    Constituent(String type) {
219      this.type = type;
220      this.children = new ArrayList<TreebankNode>();
221    }
222
223    // Constituent() {
224    // this(null);
225    // }
226
227    public void addChild(TreebankNode newChild) {
228      this.children.add(newChild);
229    }
230
231    public TreebankNode makeTreebankNode(JCas jCas) {
232      if (this.type.equals("S1")) {
233        return this.children.get(0);
234      } else {
235        int[] span = AnnotationUtil.getAnnotationsExtent(this.children);
236        TreebankNode node = new TreebankNode(jCas, span[0], span[1]);
237        node.setNodeType(this.type);
238        node.setChildren(new FSArray(jCas, this.children.size()));
239        FSCollectionFactory.fillArrayFS(node.getChildren(), this.children);
240        for (TreebankNode child : this.children)
241          child.setParent(node);
242        node.addToIndexes();
243        return node;
244      }
245    }
246  }
247
248  private static class CharniakParseParser {
249    Stack<Constituent> parseStack;
250
251    List<TreebankNode> terminals;
252
253    JCas jCas;
254
255    CharniakParseParser(JCas jCas) {
256      parseStack = new Stack<Constituent>();
257      parseStack.push(new Constituent("TOP"));
258      terminals = new ArrayList<TreebankNode>();
259      this.jCas = jCas;
260    }
261
262    void feed(String segment, TreebankNode terminal) throws IOException {
263      BufferedReader r = new BufferedReader(new StringReader(segment));
264
265      terminals.add(terminal);
266
267      for (int i = r.read(); i != -1; i = r.read()) {
268        char c = (char) i;
269        switch (c) {
270          case '*':
271            parseStack.peek().addChild(terminal);
272            break;
273          case '(':
274            parseStack.push(new Constituent(readNodeType(r)));
275            break;
276          case ')':
277            TreebankNode node = parseStack.pop().makeTreebankNode(jCas);
278            parseStack.peek().addChild(node);
279            break;
280          default:
281            throw new IOException("unexpected character in string: " + String.valueOf(c) + " ("
282                + String.valueOf((int) c) + ")");
283        }
284      }
285    }
286
287    public TopTreebankNode makeParse() {
288      int[] span = AnnotationUtil.getAnnotationsExtent(this.terminals);
289      TopTreebankNode node = new TopTreebankNode(jCas, span[0], span[1]);
290      node.setNodeType("TOP");
291      List<TreebankNode> children = parseStack.peek().children;
292      node.setChildren(new FSArray(jCas, children.size()));
293      FSCollectionFactory.fillArrayFS(node.getChildren(), children);
294      for (TreebankNode child : parseStack.peek().children)
295        child.setParent(node);
296      node.setTerminals(new FSArray(jCas, this.terminals.size()));
297      FSCollectionFactory.fillArrayFS(node.getTerminals(), this.terminals);
298      node.addToIndexes();
299      parseStack.pop();
300      return node;
301    }
302
303    private static String readNodeType(BufferedReader r) throws IOException {
304      StringBuffer b = new StringBuffer();
305
306      while (r.ready()) {
307        r.mark(1);
308        char c = (char) r.read();
309        if (c == '(' || c == ')' || c == '*') {
310          r.reset();
311          break;
312        } else {
313          b.append(c);
314        }
315      }
316
317      return b.toString();
318    }
319  }
320
321  private static class PredicateParser {
322    JCas jCas;
323
324    // String token;
325    String baseForm;
326
327    // String sense;
328    Token predicateToken;
329
330    List<SemanticArgument> arguments;
331
332    List<Token> argumentTokens;
333
334    String argumentType;
335
336    PredicateParser(JCas jCas) {
337      this.jCas = jCas;
338      this.arguments = new ArrayList<SemanticArgument>();
339    }
340
341    void feedInfo(String tokenText, String bForm, String sense, Token token) {
342      if (token == null) {
343        throw new RuntimeException(String.format("token for \"%s\" is null", tokenText));
344      }
345      // this.token = tokenText;
346      this.baseForm = bForm;
347      // this.sense = sense;
348      this.predicateToken = token;
349    }
350
351    void feed(String segment, Token token) throws IOException {
352      BufferedReader r = new BufferedReader(new StringReader(segment));
353
354      for (int i = r.read(); i != -1; i = r.read()) {
355        char c = (char) i;
356
357        switch (c) {
358          case '(':
359            this.argumentTokens = new ArrayList<Token>();
360            this.argumentType = readArgumentType(r);
361            break;
362          case ')':
363            int[] span = AnnotationUtil.getAnnotationsExtent(this.argumentTokens);
364            SemanticArgument arg = new SemanticArgument(jCas, span[0], span[1]);
365            arg.addToIndexes();
366            Annotation relation = TreebankNodeUtil.selectHighestMatchingTreebankNode(jCas, arg);
367            if (relation == null) {
368              Chunk chunk = new Chunk(jCas, span[0], span[1]);
369              relation = chunk;
370            }
371            arg.setAnnotation(relation);
372            arg.setLabel(this.argumentType);
373            arg.addToIndexes();
374            this.arguments.add(arg);
375
376            this.argumentTokens = null;
377            break;
378          case '*':
379            if (this.argumentTokens != null)
380              this.argumentTokens.add(token);
381            break;
382          default:
383            throw new IOException("unexpected character in string: " + String.valueOf(c) + " ("
384                + String.valueOf((int) c) + ")");
385        }
386      }
387    }
388
389    Predicate makePredicate() {
390      if (this.predicateToken == null) {
391        throw new RuntimeException("no predicateToken found yet");
392      }
393      Predicate predicate = new Predicate(
394          jCas,
395          this.predicateToken.getBegin(),
396          this.predicateToken.getEnd());
397      predicate.setAnnotation(this.predicateToken);
398      predicate.setArguments(new FSArray(jCas, this.arguments.size()));
399      FSCollectionFactory.fillArrayFS(predicate.getArguments(), this.arguments);
400      predicate.setBaseForm(this.baseForm);
401      predicate.addToIndexes();
402
403      return predicate;
404    }
405
406    private static String readArgumentType(BufferedReader r) throws IOException {
407      StringBuffer b = new StringBuffer();
408
409      while (true) {
410        r.mark(1);
411        int i = r.read();
412        if (i == -1)
413          break;
414
415        char c = (char) i;
416        if (c == '(' || c == ')' || c == '*') {
417          r.reset();
418          break;
419        }
420
421        b.append(c);
422      }
423
424      return b.toString();
425    }
426  }
427
428  private static class NamedEntityParser {
429
430    public NamedEntityParser(JCas view) {
431      this.view = view;
432    }
433
434    void feed(String segment, Token token) throws IOException {
435      BufferedReader r = new BufferedReader(new StringReader(segment));
436
437      for (int i = r.read(); i != -1; i = r.read()) {
438        char c = (char) i;
439
440        switch (c) {
441          case '(':
442            this.currentAnnotation = new NamedEntityAnnotation();
443            this.currentAnnotation.begin = token.getBegin();
444            this.currentAnnotation.name = readName(r);
445            break;
446          case ')':
447            this.currentAnnotation.end = token.getEnd();
448
449            NamedEntityMention nem = new NamedEntityMention(
450                view,
451                this.currentAnnotation.begin,
452                this.currentAnnotation.end);
453            Annotation relation = null;
454            try {
455              relation = TreebankNodeUtil.selectHighestMatchingTreebankNode(view, nem);
456            } catch (NoSuchElementException e) {
457            }
458            nem.setAnnotation(relation);
459            nem.setMentionType(this.currentAnnotation.name);
460            nem.addToIndexes();
461
462            this.currentAnnotation = null;
463            break;
464          case '*':
465            break;
466          default:
467            throw new IOException("unexpected character in string: " + String.valueOf(c) + " ("
468                + String.valueOf((int) c) + ")");
469        }
470      }
471    }
472
473    private static String readName(BufferedReader r) throws IOException {
474      StringBuffer b = new StringBuffer();
475
476      while (true) {
477        r.mark(1);
478        int i = r.read();
479        if (i == -1)
480          break;
481
482        char c = (char) i;
483        if (c == '*') {
484          r.reset();
485          break;
486        }
487
488        b.append(c);
489      }
490
491      return b.toString();
492    }
493
494    JCas view;
495
496    NamedEntityAnnotation currentAnnotation = null;
497
498    private static class NamedEntityAnnotation {
499      public NamedEntityAnnotation() {
500      }
501
502      int begin;
503
504      int end;
505
506      String name;
507    }
508  }
509}