/**
 * Copyright (c) 2007-2008, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.corpus.conll2005;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Stack;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.cleartk.ne.type.NamedEntityMention;
import org.cleartk.srl.type.Chunk;
import org.cleartk.srl.type.Predicate;
import org.cleartk.srl.type.SemanticArgument;
import org.cleartk.syntax.constituent.type.TopTreebankNode;
import org.cleartk.syntax.constituent.type.TreebankNode;
import org.cleartk.syntax.constituent.type.TreebankNodeUtil;
import org.cleartk.token.type.Sentence;
import org.cleartk.token.type.Token;
import org.cleartk.util.AnnotationUtil;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.SofaCapability;
import org.apache.uima.fit.util.FSCollectionFactory;

/**
 * <br>
 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
 * All rights reserved.
 * <p>
 * Reads column-formatted text from the {@code Conll2005Constants.CONLL_2005_VIEW} view, one token
 * per line, and populates the default view with the reconstructed document text plus
 * {@link Token}, {@link Sentence}, {@link TreebankNode}/{@link TopTreebankNode},
 * {@link NamedEntityMention}, {@link Predicate} and {@link SemanticArgument} annotations.
 * <p>
 * Each input line is split on whitespace into: word, part of speech, parse segment, named entity
 * segment, an optional verb sense tag (present only when {@link #PARAM_HAS_VERB_SENSES} is true),
 * target verb, and then zero or more argument segments (one per predicate in the sentence).
 */
@SofaCapability(
    inputSofas = { Conll2005Constants.CONLL_2005_VIEW, CAS.NAME_DEFAULT_SOFA },
    outputSofas = {})
public class Conll2005GoldAnnotator extends JCasAnnotator_ImplBase {

  public static final String PARAM_HAS_VERB_SENSES = "hasVerbSenses";

  @ConfigurationParameter(
      name = PARAM_HAS_VERB_SENSES,
      mandatory = true,
      description = "does the data file contain verb sense tags")
  private Boolean hasVerbSenses;

  @Override
  public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);
  }

  /**
   * Converts the column data of one CAS into annotations on the default view.
   *
   * @param jCas
   *          a CAS providing both the CoNLL 2005 view (input) and the default view (output)
   * @throws AnalysisEngineProcessException
   *           if a view cannot be obtained or a column segment contains an unexpected character
   *           or an unbalanced closing parenthesis
   * @throws RuntimeException
   *           if a line has too few fields, or a line's number of argument segments is neither 0
   *           nor the number of predicates
   */
  @Override
  public void process(JCas jCas) throws AnalysisEngineProcessException {
    try {
      JCas conllView = jCas.getView(Conll2005Constants.CONLL_2005_VIEW);
      JCas initView = jCas.getView(CAS.NAME_DEFAULT_SOFA);

      String conllText = conllView.getSofaDataString();

      List<Conll2005Line> conll2005Lines = new ArrayList<Conll2005Line>();
      for (String line : conllText.split("\n")) {
        conll2005Lines.add(new Conll2005Line(line, hasVerbSenses));
      }

      StringBuilder docText = new StringBuilder();

      CharniakParseParser parser = new CharniakParseParser(initView);

      // A line whose target-verb column is not "-" introduces one predicate.
      int numberOfPredicates = 0;
      for (Conll2005Line line : conll2005Lines) {
        if (!line.targetVerb.equals("-")) {
          numberOfPredicates += 1;
        }
      }

      int currentPredicate = 0;
      PredicateParser[] predicateParsers = new PredicateParser[numberOfPredicates];
      for (int i = 0; i < numberOfPredicates; i++) {
        predicateParsers[i] = new PredicateParser(initView);
      }

      NamedEntityParser namedEntityParser = new NamedEntityParser(initView);

      for (Conll2005Line line : conll2005Lines) {
        if (line.argumentSegments.length != 0 && line.argumentSegments.length != numberOfPredicates) {
          throw new RuntimeException(String.format(
              "expected 0 or %d segments, found %d",
              numberOfPredicates,
              line.argumentSegments.length));
        }

        // Words are joined with single spaces to form the document text.
        if (docText.length() > 0 && line.word.length() > 0) {
          docText.append(" ");
        }
        int startIndex = docText.length();
        docText.append(line.word);
        int endIndex = docText.length();

        Token token = new Token(initView, startIndex, endIndex);
        token.setPos(line.pos);
        token.addToIndexes();

        TreebankNode terminal = new TreebankNode(initView, startIndex, endIndex);
        terminal.setNodeType(line.pos);
        terminal.setNodeValue(line.word);
        // Created through initView like every other feature structure in this method.
        terminal.setChildren(new FSArray(initView, 0));
        terminal.setLeaf(true);
        terminal.addToIndexes();

        // The parse is fed first so that constituents closing on this line already exist when
        // the named entity and argument parsers look up matching tree nodes.
        parser.feed(line.charniakParseSegment, terminal);

        namedEntityParser.feed(line.neSegment, token);

        if (line.argumentSegments.length > 0) {
          for (int i = 0; i < numberOfPredicates; i++) {
            predicateParsers[i].feed(line.argumentSegments[i], token);
          }
        }

        // The n-th line carrying a target verb describes the n-th predicate (n-th argument column).
        if (!line.targetVerb.equals("-")) {
          predicateParsers[currentPredicate].feedInfo(
              line.word,
              line.targetVerb,
              line.verbSenseTag,
              token);
          currentPredicate += 1;
        }
      }
      initView.setSofaDataString(docText.toString(), "text/plain");

      Sentence sentence = new Sentence(initView, 0, docText.length());
      sentence.addToIndexes();

      parser.makeParse();

      for (PredicateParser predicateParser : predicateParsers) {
        predicateParser.makePredicate();
      }

    } catch (CASException e) {
      throw new AnalysisEngineProcessException(e);
    } catch (IOException e) {
      throw new AnalysisEngineProcessException(e);
    }
  }

  /**
   * Reads characters from {@code reader} until end of input or until one of {@code terminators}
   * is seen. A terminator is pushed back (via mark/reset) so the caller reads it next.
   *
   * @param reader
   *          a reader that supports {@code mark}
   * @param terminators
   *          the characters that end the label
   * @return the characters read before the terminator or end of input, possibly empty
   */
  private static String readLabel(BufferedReader reader, String terminators) throws IOException {
    StringBuilder label = new StringBuilder();

    while (true) {
      reader.mark(1);
      int next = reader.read();
      if (next == -1) {
        // End of input must be tested explicitly: ready() stays true for string-backed readers.
        break;
      }
      if (terminators.indexOf(next) >= 0) {
        reader.reset();
        break;
      }
      label.append((char) next);
    }

    return label.toString();
  }

  /** One whitespace-separated input line, split into its columns. */
  private static class Conll2005Line {
    final String word;

    final String pos;

    final String charniakParseSegment;

    final String neSegment;

    /** {@code null} when the input has no verb sense column. */
    final String verbSenseTag;

    final String targetVerb;

    final String[] argumentSegments;

    /**
     * @param line
     *          one line of input
     * @param hasSenseTag
     *          whether a verb sense column precedes the target verb column
     * @throws IllegalArgumentException
     *           if the line has fewer columns than the fixed ones require
     */
    Conll2005Line(String line, boolean hasSenseTag) {
      String[] fields = line.split("\\s+");
      int requiredFields = hasSenseTag ? 6 : 5;
      if (fields.length < requiredFields) {
        throw new IllegalArgumentException(String.format(
            "expected at least %d whitespace-separated fields, found %d in line \"%s\"",
            requiredFields,
            fields.length,
            line));
      }

      int i = 0;
      this.word = fields[i++].trim();
      this.pos = fields[i++].trim();
      this.charniakParseSegment = fields[i++].trim();
      this.neSegment = fields[i++].trim();

      if (hasSenseTag) {
        this.verbSenseTag = fields[i++].trim();
      } else {
        this.verbSenseTag = null;
      }

      this.targetVerb = fields[i++].trim();

      // Everything after the target verb is an argument segment.
      this.argumentSegments = new String[fields.length - i];
      for (int j = 0; j < argumentSegments.length; j++) {
        this.argumentSegments[j] = fields[i++].trim();
      }
    }
  }

  /** A parse constituent under construction: a node type plus the children collected so far. */
  private static class Constituent {
    String type;

    List<TreebankNode> children;

    Constituent(String type) {
      this.type = type;
      this.children = new ArrayList<TreebankNode>();
    }

    public void addChild(TreebankNode newChild) {
      this.children.add(newChild);
    }

    /**
     * Turns this constituent into an indexed {@link TreebankNode} spanning its children. A
     * constituent of type "S1" creates no node of its own; its first child is returned instead.
     */
    public TreebankNode makeTreebankNode(JCas jCas) {
      if (this.type.equals("S1")) {
        return this.children.get(0);
      } else {
        int[] span = AnnotationUtil.getAnnotationsExtent(this.children);
        TreebankNode node = new TreebankNode(jCas, span[0], span[1]);
        node.setNodeType(this.type);
        node.setChildren(new FSArray(jCas, this.children.size()));
        FSCollectionFactory.fillArrayFS(node.getChildren(), this.children);
        for (TreebankNode child : this.children) {
          child.setParent(node);
        }
        node.addToIndexes();
        return node;
      }
    }
  }

  /**
   * Builds a treebank tree from per-token parse segments made of '(' + node type (open a
   * constituent), '*' (the current terminal) and ')' (close a constituent).
   */
  private static class CharniakParseParser {
    Stack<Constituent> parseStack;

    List<TreebankNode> terminals;

    JCas jCas;

    CharniakParseParser(JCas jCas) {
      parseStack = new Stack<Constituent>();
      parseStack.push(new Constituent("TOP"));
      terminals = new ArrayList<TreebankNode>();
      this.jCas = jCas;
    }

    /**
     * Consumes the parse segment of one token.
     *
     * @throws IOException
     *           on an unexpected character, or a ')' with no open constituent to close
     */
    void feed(String segment, TreebankNode terminal) throws IOException {
      BufferedReader r = new BufferedReader(new StringReader(segment));

      terminals.add(terminal);

      for (int i = r.read(); i != -1; i = r.read()) {
        char c = (char) i;
        switch (c) {
          case '*':
            parseStack.peek().addChild(terminal);
            break;
          case '(':
            parseStack.push(new Constituent(readLabel(r, "()*")));
            break;
          case ')':
            // The bottom entry is the synthetic TOP constituent, which is never closed by input.
            if (parseStack.size() < 2) {
              throw new IOException("unbalanced ')' in parse segment: " + segment);
            }
            TreebankNode node = parseStack.pop().makeTreebankNode(jCas);
            parseStack.peek().addChild(node);
            break;
          default:
            throw new IOException("unexpected character in string: " + String.valueOf(c) + " ("
                + String.valueOf((int) c) + ")");
        }
      }
    }

    /**
     * Creates and indexes the {@link TopTreebankNode} covering all terminals fed so far, using
     * the children of the constituent on top of the stack, and pops that constituent.
     */
    public TopTreebankNode makeParse() {
      int[] span = AnnotationUtil.getAnnotationsExtent(this.terminals);
      TopTreebankNode node = new TopTreebankNode(jCas, span[0], span[1]);
      node.setNodeType("TOP");
      List<TreebankNode> children = parseStack.peek().children;
      node.setChildren(new FSArray(jCas, children.size()));
      FSCollectionFactory.fillArrayFS(node.getChildren(), children);
      for (TreebankNode child : children) {
        child.setParent(node);
      }
      node.setTerminals(new FSArray(jCas, this.terminals.size()));
      FSCollectionFactory.fillArrayFS(node.getTerminals(), this.terminals);
      node.addToIndexes();
      parseStack.pop();
      return node;
    }
  }

  /**
   * Collects the arguments of a single predicate from one argument column, and finally creates
   * the {@link Predicate} annotation.
   */
  private static class PredicateParser {
    JCas jCas;

    String baseForm;

    Token predicateToken;

    List<SemanticArgument> arguments;

    /** Tokens of the argument currently open, or {@code null} when no argument is open. */
    List<Token> argumentTokens;

    String argumentType;

    PredicateParser(JCas jCas) {
      this.jCas = jCas;
      this.arguments = new ArrayList<SemanticArgument>();
    }

    /**
     * Records the predicate's base form and token. {@code sense} is accepted but not stored.
     *
     * @throws RuntimeException
     *           if {@code token} is null
     */
    void feedInfo(String tokenText, String bForm, String sense, Token token) {
      if (token == null) {
        throw new RuntimeException(String.format("token for \"%s\" is null", tokenText));
      }
      this.baseForm = bForm;
      this.predicateToken = token;
    }

    /**
     * Consumes this predicate's argument segment for one token: '(' + label opens an argument,
     * '*' adds the token to the open argument (if any), ')' closes and indexes the argument.
     *
     * @throws IOException
     *           on an unexpected character, or a ')' with no open argument
     */
    void feed(String segment, Token token) throws IOException {
      BufferedReader r = new BufferedReader(new StringReader(segment));

      for (int i = r.read(); i != -1; i = r.read()) {
        char c = (char) i;

        switch (c) {
          case '(':
            this.argumentTokens = new ArrayList<Token>();
            this.argumentType = readLabel(r, "()*");
            break;
          case ')':
            if (this.argumentTokens == null) {
              throw new IOException("unbalanced ')' in argument segment: " + segment);
            }
            int[] span = AnnotationUtil.getAnnotationsExtent(this.argumentTokens);
            SemanticArgument arg = new SemanticArgument(jCas, span[0], span[1]);
            // Indexed exactly once, before the lookup as in the original code; the former second
            // addToIndexes() call after the setters was redundant.
            arg.addToIndexes();
            Annotation relation = TreebankNodeUtil.selectHighestMatchingTreebankNode(jCas, arg);
            if (relation == null) {
              // No tree node matches: fall back to a (non-indexed) chunk over the same span.
              relation = new Chunk(jCas, span[0], span[1]);
            }
            arg.setAnnotation(relation);
            arg.setLabel(this.argumentType);
            this.arguments.add(arg);

            this.argumentTokens = null;
            break;
          case '*':
            if (this.argumentTokens != null) {
              this.argumentTokens.add(token);
            }
            break;
          default:
            throw new IOException("unexpected character in string: " + String.valueOf(c) + " ("
                + String.valueOf((int) c) + ")");
        }
      }
    }

    /**
     * Creates and indexes the {@link Predicate} over the predicate token with all collected
     * arguments.
     *
     * @throws RuntimeException
     *           if {@link #feedInfo} has not supplied a predicate token
     */
    Predicate makePredicate() {
      if (this.predicateToken == null) {
        throw new RuntimeException("no predicateToken found yet");
      }
      Predicate predicate = new Predicate(
          jCas,
          this.predicateToken.getBegin(),
          this.predicateToken.getEnd());
      predicate.setAnnotation(this.predicateToken);
      predicate.setArguments(new FSArray(jCas, this.arguments.size()));
      FSCollectionFactory.fillArrayFS(predicate.getArguments(), this.arguments);
      predicate.setBaseForm(this.baseForm);
      predicate.addToIndexes();

      return predicate;
    }
  }

  /** Creates {@link NamedEntityMention} annotations from the named entity column. */
  private static class NamedEntityParser {

    JCas view;

    /** The mention currently open, or {@code null} when none is open. */
    NamedEntityAnnotation currentAnnotation = null;

    public NamedEntityParser(JCas view) {
      this.view = view;
    }

    /**
     * Consumes the named entity segment for one token: '(' + name opens a mention at the token's
     * begin, ')' closes it at the token's end and indexes it, '*' is skipped.
     *
     * @throws IOException
     *           on an unexpected character, or a ')' with no open mention
     */
    void feed(String segment, Token token) throws IOException {
      BufferedReader r = new BufferedReader(new StringReader(segment));

      for (int i = r.read(); i != -1; i = r.read()) {
        char c = (char) i;

        switch (c) {
          case '(':
            this.currentAnnotation = new NamedEntityAnnotation();
            this.currentAnnotation.begin = token.getBegin();
            // Unlike the other columns, a name is terminated only by '*'.
            this.currentAnnotation.name = readLabel(r, "*");
            break;
          case ')':
            if (this.currentAnnotation == null) {
              throw new IOException("unbalanced ')' in named entity segment: " + segment);
            }
            this.currentAnnotation.end = token.getEnd();

            NamedEntityMention nem = new NamedEntityMention(
                view,
                this.currentAnnotation.begin,
                this.currentAnnotation.end);
            Annotation relation = null;
            try {
              relation = TreebankNodeUtil.selectHighestMatchingTreebankNode(view, nem);
            } catch (NoSuchElementException ignored) {
              // Best effort: if the lookup fails this way, the mention is kept with no linked
              // tree node (relation stays null).
            }
            nem.setAnnotation(relation);
            nem.setMentionType(this.currentAnnotation.name);
            nem.addToIndexes();

            this.currentAnnotation = null;
            break;
          case '*':
            break;
          default:
            throw new IOException("unexpected character in string: " + String.valueOf(c) + " ("
                + String.valueOf((int) c) + ")");
        }
      }
    }

    /** Offsets and type name of a mention that has been opened but not yet closed. */
    private static class NamedEntityAnnotation {
      public NamedEntityAnnotation() {
      }

      int begin;

      int end;

      String name;
    }
  }
}