/**
 * Copyright (c) 2007-2008, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.util.treebank;

import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A stand-alone parser for Penn Treebank data. It exists to synchronize PropBank data with labeled
 * extents of plain text, which is not possible without parsing the treebank data first. The parse
 * methods handle a single sentence of treebank data, e.g. from wsj/mrg/06/wsj_0656.mrg.
 *
 * The OpenNLP treebank parser was considered first, but it makes a few assumptions about what to
 * keep from the parse that would make it difficult to align with the PropBank data. See:
 * http://sourceforge.net/projects/opennlp/forums/forum/9943/topic/1751983 for relevant discussion.
 * The code below started as a modification of that implementation and now bears little resemblance
 * to it, although some snippets may still be taken directly from that code. The two regular
 * expressions used are very similar.
 *
 * <br>
 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
 * All rights reserved.
 *
 *
 * @author Philip Ogren
 */
public class TreebankFormatParser {
  /**
   * Used to identify tokens in Penn Treebank labeled constituents. It will match strings such as:
   * <ul>
   * <li>(NNP Community)
   * <li>(: --)
   * <li>(-NONE- *U*)
   * </ul>
   */
  public static final String LEAF_NODE_REGEX = "\\(([^( )]+) ([^( )]+)\\s*\\)";

  private static final Pattern leafNodePattern = Pattern.compile(LEAF_NODE_REGEX);

  /**
   * Uses the leaf node pattern to identify a string as a terminal. Examples:
   * <ul>
   * <li>parseFragment = "(NNP Community)", returns a leaf node
   * <li>parseFragment = "(QP ($ $) (CD 107) (CD million) )", returns null
   * </ul>
   *
   * @param parseFragment
   *          some fragment of a treebank parse.
   * @return a leaf node if the whole string matches {@link #LEAF_NODE_REGEX}. Otherwise, null is
   *         returned.
   */
  public static TreebankNode getLeafNode(String parseFragment) {
    Matcher leafNodeMatcher = leafNodePattern.matcher(parseFragment);
    if (!leafNodeMatcher.matches()) {
      return null;
    }
    String type = leafNodeMatcher.group(1);
    String value = leafNodeMatcher.group(2);
    TreebankNode node = new TreebankNode();
    node.setType(getTypeFromType(type));
    node.setTags(getTagsFromType(type));
    node.setValue(value);
    node.setLeaf(true);
    // the token and the value will almost always be the same except for
    // a few special values (e.g. "-RCB-", "-LCB-", and "-NONE-")
    node.setText(getToken(node.getValue(), node.getType()));
    return node;
  }

  /**
   * Extracts the bare constituent type from a full label, dropping any function tags, e.g.
   * "NP-SBJ=1" gives "NP" and "-NONE-" gives "-NONE-".
   *
   * @param fullType
   *          the label as it appears directly after an opening parenthesis
   * @return the type without its tags. Labels that cannot be decomposed (a leading dash with no
   *         closing dash, or a label made only of separator characters) are returned unchanged
   *         rather than producing an empty type or an exception.
   */
  private static String getTypeFromType(String fullType) {
    if (fullType.startsWith("-")) {
      // dash-delimited types such as -NONE- or -LRB- end at the second dash
      int closingDash = fullType.indexOf('-', 1);
      return closingDash < 0 ? fullType : fullType.substring(0, closingDash + 1);
    }

    String[] parts = fullType.split("[-=]");
    // split() yields an empty array when the label consists only of separators (e.g. "=")
    return parts.length == 0 ? fullType : parts[0];
  }

  /**
   * Extracts the function tags that follow the constituent type in a full label, e.g. "NP-SBJ=1"
   * gives {"SBJ", "1"}.
   *
   * @param fullType
   *          the label as it appears directly after an opening parenthesis
   * @return the tags in order of appearance; an empty array if the label carries none or cannot be
   *         decomposed.
   */
  private static String[] getTagsFromType(String fullType) {
    if (fullType.startsWith("-")) {
      int closingDash = fullType.indexOf('-', 1);
      if (closingDash < 0) {
        return new String[0];
      }
      String rest = fullType.substring(closingDash + 1);
      return rest.length() > 0 ? rest.split("[-=]") : new String[0];
    }

    String[] parts = fullType.split("[-=]");
    if (parts.length <= 1) {
      return new String[0];
    }
    String[] tags = new String[parts.length - 1];
    System.arraycopy(parts, 1, tags, 0, tags.length);
    return tags;
  }

  /**
   * Used to identify the type of a constituent in a treebank parse tree. It will match strings such
   * as:
   * <ul>
   * <li>"NNP" in "(NNP Community)"
   * <li>":" in "(: --)"
   * <li>"-NONE-" in "(-NONE- *U*)"
   * </ul>
   */
  public static final String TYPE_REGEX = "^\\(([^() ]+)";

  private static final Pattern typePattern = Pattern.compile(TYPE_REGEX);

  /**
   * Returns the type of a constituent of some fragment of a treebank parse. Assumes that the first
   * character is a parenthesis. Examples:
   * <ul>
   * <li>parseFragment = "(NP-LOC (NNP Calif.) )" return = "NP-LOC"
   * <li>parseFragment = "(NP" return "NP"
   * <li>parseFragment = "(-NONE- *U*) ) (PP (IN of)" return = "-NONE-"
   * </ul>
   *
   * @param parseFragment
   *          some fragment of a treebank parse
   * @return the type of the constituent, or null if the fragment does not start with an opening
   *         parenthesis followed by a type.
   */
  public static String getType(String parseFragment) {
    Matcher typeMatcher = typePattern.matcher(parseFragment);
    return typeMatcher.find() ? typeMatcher.group(1) : null;
  }

  public static final String cleanUPRegex1 = "\\s+";

  private static final Pattern cleanUpPattern1 = Pattern.compile(cleanUPRegex1, Pattern.MULTILINE);

  public static final String cleanUPRegex2 = "\\( \\(";

  private static final Pattern cleanUpPattern2 = Pattern.compile(cleanUPRegex2, Pattern.MULTILINE);

  public static final String cleanUPRegex3 = "\\) \\)";

  private static final Pattern cleanUpPattern3 = Pattern.compile(cleanUPRegex3, Pattern.MULTILINE);

  public static final String cleanUPRegex4 = "\\s*\\(\\s*\\(";

  private static final Pattern cleanUpPattern4 = Pattern.compile(cleanUPRegex4, Pattern.MULTILINE);

  /**
   * Cleans up the parse string for a sentence in the treebank syntax. The most important thing that
   * it does is add a type called TOP to the top node of the sentence, which simplifies parsing. The
   * other replacements just collapse white space and are probably unnecessary. This was inspired by
   * the OpenNLP solution which takes in one line at a time from a file that has been modified in
   * this way.
   *
   * @param parse
   *          a String in the treebank format
   * @return a String in the treebank format that has been cleaned up a bit.
   */
  public static String prepareString(String parse) {
    String cleaned = cleanUpPattern1.matcher(parse).replaceAll(" ");
    cleaned = cleanUpPattern2.matcher(cleaned).replaceAll("((");
    cleaned = cleanUpPattern3.matcher(cleaned).replaceAll("))");
    // only the first "((" is rewritten: it is the untyped wrapper around the sentence
    cleaned = cleanUpPattern4.matcher(cleaned).replaceFirst("(TOP (");
    return cleaned.trim();
  }

  /**
   * A treebank parse does not preserve whitespace information. This method provides a simple
   * mechanism for inferring the original plain text of a treebank parse. If you have access to the
   * original plain text, then you can bypass use of this method by calling the appropriate parse
   * method.
   *
   * @see #parse(String, String, int)
   *
   * @param treebankText
   *          One or more parses in Treebank parenthesized format.
   * @return a "best" guess of the original plain text given in the parse, one line per sentence.
   */
  public static String inferPlainText(String treebankText) {
    StringBuilder sb = new StringBuilder();
    for (String parse : splitSentences(treebankText)) {
      Matcher matcher = leafNodePattern.matcher(parse);
      while (matcher.find()) {
        TreebankNode node = getLeafNode(matcher.group());
        String tokenText = node.getText();
        if (tokenText != null && tokenText.length() > 0) {
          // a space was optimistically appended after the previous token; take it back when the
          // current token attaches directly to its left neighbour
          int lastIndex = sb.length() - 1;
          if (lastIndex > 0 && !needsSpaceBefore(tokenText) && sb.charAt(lastIndex) == ' ') {
            sb.deleteCharAt(lastIndex);
          }
          sb.append(tokenText);
          if (needsSpaceAfter(tokenText)) {
            sb.append(" ");
          }
        }
      }
      // drop the space left behind by the last token of the sentence
      int lastIndex = sb.length() - 1;
      if (lastIndex >= 0 && sb.charAt(lastIndex) == ' ') {
        sb.deleteCharAt(lastIndex);
      }
      sb.append('\n');
    }
    return sb.toString().trim();
  }

  /** Tokens that attach directly to the token on their left. */
  private static final String[] NO_SPACE_BEFORE_TOKENS = {
      ".",
      ",",
      ":",
      ";",
      "?",
      "'s",
      "'t",
      "\"",
      "!",
      ")",
      "]" };

  /** Tokens that attach directly to the token on their right. */
  private static final String[] NO_SPACE_AFTER_TOKENS = { "\"", "(", "[" };

  /** Returns false for tokens that should be glued to the preceding token. */
  private static boolean needsSpaceBefore(String tokenText) {
    for (String noSpaceToken : NO_SPACE_BEFORE_TOKENS) {
      if (tokenText.equals(noSpaceToken)) {
        return false;
      }
    }
    return true;
  }

  /** Returns false for tokens that should be glued to the following token. */
  private static boolean needsSpaceAfter(String tokenText) {
    for (String noSpaceToken : NO_SPACE_AFTER_TOKENS) {
      if (tokenText.equals(noSpaceToken)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Create TreebankNode objects corresponding to the given TreeBank format parse, e.g.:
   *
   * <PRE>
   * ( (X (NP (NP (NML (NN Complex ) (NN trait )) (NN analysis )) (PP (IN of ) (NP (DT the ) (NN mouse ) (NN striatum )))) (: : ) (S (NP-SBJ (JJ independent ) (NNS QTLs )) (VP (VBP modulate ) (NP (NP (NN volume )) (CC and ) (NP (NN neuron ) (NN number)))))) )
   * </PRE>
   *
   * The text will be inferred automatically from the words in the parse.
   *
   * @param parse
   *          A TreeBank formatted parse
   * @return The TreebankNode root of the parse tree
   * @see #inferPlainText(String)
   * @see #parse(String, String, int)
   */
  public static TopTreebankNode parse(String parse) {
    parse = prepareString(parse);
    String plainText = inferPlainText(parse).trim();
    return parse(parse, plainText, 0);
  }

  /**
   * Verifies that the text recorded on a node equals the span of the plain text that the node's
   * offsets point at.
   *
   * @param node
   *          the node whose text and offsets are checked; its end offset is pulled back by one when
   *          only a trailing period fails to align
   * @param text
   *          the plain text the parse is being aligned to
   * @throws IllegalArgumentException
   *           if the node text and the plain text differ in any other way
   */
  private static void checkText(TreebankNode node, String text) {
    String text1 = node.getText();
    int start = node.getTextBegin();
    int end = node.getTextEnd();
    String text2 = text.substring(start, end);
    if (text1.equals(text2)) {
      return;
    }
    // TreeBank adds in (. .) nodes in odd places, e.g. when a sentence
    // ends with U.S. (and no final period). As a result, we need to
    // allow periods to match whitespace and adjust the node bounds.
    String prefix1 = text1.substring(0, text1.length() - 1);
    String prefix2 = text2.substring(0, text2.length() - 1);
    if (text1.endsWith(".") && prefix1.equals(prefix2)) {
      node.setTextEnd(node.getTextEnd() - 1);
    } else {
      throw new IllegalArgumentException(
          "plain text does not align with tokens in treebank parse. node text = '" + text1
              + "' plain text = '" + text2 + "'");
    }
  }

  /**
   * Create TreebankNode objects corresponding to the given TreeBank format parse, e.g.:
   *
   * <PRE>
   * ( (X (NP (NP (NML (NN Complex ) (NN trait )) (NN analysis )) (PP (IN of ) (NP (DT the ) (NN mouse ) (NN striatum )))) (: : ) (S (NP-SBJ (JJ independent ) (NNS QTLs )) (VP (VBP modulate ) (NP (NP (NN volume )) (CC and ) (NP (NN neuron ) (NN number)))))) )
   * </PRE>
   *
   * The start and end offsets of each TreebankNode will be aligned to the word offsets in the given
   * text.
   *
   * @param parse
   *          A TreeBank formatted parse
   * @param text
   *          The text to which the parse should be aligned. It is dereferenced for every
   *          constituent, so a null text fails for any parse that contains a closing parenthesis.
   * @param textOffset
   *          The character offset at which the parse text should start to be aligned. For example,
   *          if the words of the parse start right at the beginning of the text, the appropriate
   *          textOffset is 0.
   * @return The TreebankNode root of the parse tree. The root node will be a TopTreebankNode, and
   *         all its descendants will be TreebankNodes.
   * @throws IllegalArgumentException
   *           wrapping any runtime exception raised while parsing, including a failed alignment
   *           between the parse tokens and the text
   * @see TopTreebankNode
   * @see TreebankNode
   */
  public static TopTreebankNode parse(String parse, String text, int textOffset) {
    try {
      TopTreebankNode topNode = new TopTreebankNode();
      parse = prepareString(parse);
      // used to capture the plain text of the sentence.
      StringBuilder consumedText = new StringBuilder();
      if (text != null) {
        textOffset = movePastWhiteSpaceChars(text, textOffset);
        consumedText.append(text.substring(0, textOffset));
      }

      Stack<Integer> parseOffsetStack = new Stack<Integer>();
      Stack<Integer> plainTextOffsetStack = new Stack<Integer>();

      // keeps the nodes that are waiting for their parents to be completed.
      Stack<TreebankNode> parseStack = new Stack<TreebankNode>();

      // the last ')' closes the top node; its position does not change during the scan
      int lastCloseParen = parse.lastIndexOf(')');

      for (int ci = 0; ci < parse.length(); ci++) {
        char c = parse.charAt(ci);
        if (c == '(') {
          // at the start of each constituent we will push the starting index of it
          // w.r.t. the parse string.
          parseOffsetStack.push(ci);
          // also push the starting index w.r.t. the plain text of the sentence.
          plainTextOffsetStack.push(consumedText.length());
        } else if (c == ')') {
          int begin = parseOffsetStack.pop();
          int end = ci;
          // the portion of the parse string that corresponds to the constituent
          // closed by this ')'.
          String subParse = parse.substring(begin, end + 1);

          int textBegin = plainTextOffsetStack.pop();

          TreebankNode node = getLeafNode(subParse);
          if (node != null) {
            node.setTopNode(topNode);
            node.setParseBegin(begin);
            node.setParseEnd(end + 1);
            String token = node.getText();

            if (token.length() > 0) {
              // copy over the white space that separates this token from the previous one
              int realBegin = movePastWhiteSpaceChars(text, textBegin);
              consumedText.append(text.substring(textBegin, realBegin));
              consumedText.append(token);
              node.setTextBegin(realBegin);
              node.setTextEnd(realBegin + token.length());
            } else {
              // empty tokens (type -NONE-) occupy a zero-length span
              node.setTextBegin(textBegin);
              node.setTextEnd(textBegin);
            }
            checkText(node, text);
            parseStack.push(node);
          } else {
            if (lastCloseParen == ci) {
              // the last ')' is the top node; this is the instance that will be returned.
              node = topNode;
            } else {
              node = new TreebankNode();
            }
            node.setTopNode(topNode);
            node.setParseBegin(begin);
            node.setParseEnd(end + 1);
            String type = getType(subParse);
            node.setType(getTypeFromType(type));
            node.setTags(getTagsFromType(type));
            node.setLeaf(false);
            // keep adding the nodes on the stack until it is empty or the next node on the
            // stack starts before the current node (i.e. has a different parent than the
            // current node that will be completed later.)
            while (parseStack.size() > 0
                && parseStack.peek().getParseBegin() > node.getParseBegin()) {
              TreebankNode child = parseStack.pop();
              node.addChild(child);
              child.setParent(node);
            }
            // we typically add a token followed by a space to plainText, except when the
            // token is an empty string as it is when the corresponding type is -NONE-
            int realBegin = movePastWhiteSpaceChars(text, textBegin);
            node.setTextBegin(realBegin);
            node.setTextEnd(Math.max(realBegin, consumedText.length()));

            // a constituent made up only of empty tokens can start beyond the text consumed so
            // far; it then covers no text at all
            int nodeBegin = node.getTextBegin();
            int nodeEnd = node.getTextEnd();
            if (nodeBegin < 0 || nodeBegin > nodeEnd || nodeEnd > consumedText.length()) {
              node.setText("");
            } else {
              node.setText(consumedText.substring(nodeBegin, nodeEnd));
            }
            checkText(node, text);
            parseStack.push(node);
          }
        }
      }

      topNode.setTreebankParse(parse);
      topNode.initTerminalNodes();
      return topNode;
    } catch (RuntimeException e) {
      throw new IllegalArgumentException("exception thrown when parsing the following: " + parse, e);
    }
  }

  private static final Pattern nonwhiteSpaceCharPattern = Pattern.compile("[^\\s]");

  /**
   * Finds the first non-whitespace character at or after the given offset.
   *
   * @param text
   *          the text to scan
   * @param textOffset
   *          the index at which to start scanning
   * @return the index of the first non-whitespace character at or after textOffset, or textOffset
   *         itself when only white space (or nothing) follows.
   * @throws IndexOutOfBoundsException
   *           if textOffset is negative or greater than the length of the text
   */
  public static int movePastWhiteSpaceChars(String text, int textOffset) {
    Matcher matcher = nonwhiteSpaceCharPattern.matcher(text);
    return matcher.find(textOffset) ? matcher.start() : textOffset;
  }

  /**
   * Replace specially encoded tokens with their original textual representation.
   * (http://www.cis.upenn.edu/~treebank/tokenization.html)
   *
   * @param value
   *          the token as it appears in the treebank parse
   * @param type
   *          the type of the leaf node; "-NONE-" marks an empty element
   * @return The string in its original textual representation; the empty string for empty
   *         elements.
   */
  private static String getToken(String value, String type) {
    if (type.equals("-NONE-")) {
      return "";
    }

    String token = value.replace("-RCB-", "}");
    token = token.replace("-LCB-", "{");
    token = token.replace("-RRB-", ")");
    token = token.replace("-LRB-", "(");
    token = token.replace("-RSB-", "]");
    token = token.replace("-LSB-", "[");
    token = token.replace("``", "\"");
    token = token.replace("''", "\"");
    // slashes are escaped in the treebank
    return token.replace("\\/", "/");
  }

  /** Zero-width match in front of every "( (" that opens a sentence in native PTB files. */
  private static final Pattern sentenceStartPattern = Pattern.compile("(?=\\(\\s*\\()");

  /**
   * Splits an .mrg file (e.g. wsj/mrg/00/wsj_0020.mrg) into sentence parses.
   *
   * Generally speaking, we expect one treebanked sentence per line. However, the native Penn
   * Treebank data contains parsed sentences that are broken up across multiple lines. Each sentence
   * in the PTB starts with "( (S..." and so we split on this to get the sentences. If the contents
   * split into several pieces on that pattern, those pieces are returned (minus a blank leading
   * piece). Otherwise the lines of the input are returned, provided each line has matching
   * parentheses. Contents that hold a single "( (S..." sentence wrapped over several lines are
   * returned as that one sentence.
   *
   * @param mrgContents
   *          the contents of a treebank file
   * @return individual sentence parses from treebank - i.e. strings of the form "( (S..."
   * @throws IllegalArgumentException
   *           if the contents are treated as one sentence per line and a line has unbalanced
   *           parentheses
   */
  public static String[] splitSentences(String mrgContents) {
    // Splitting on this regular expression can cause the first value in the array to be an
    // empty string if e.g. the first line of the file is blank
    String[] contents = sentenceStartPattern.split(mrgContents);
    if (contents.length > 1) {
      if (contents[0].trim().equals("")) {
        String[] returnValues = new String[contents.length - 1];
        System.arraycopy(contents, 1, returnValues, 0, returnValues.length);
        return returnValues;
      }
      return contents;
    }

    String[] lines = mrgContents.split("\r?\n");
    String unbalancedLine = null;
    for (String line : lines) {
      if (!parensMatch(line)) {
        unbalancedLine = line;
        break;
      }
    }
    if (unbalancedLine == null) {
      return lines;
    }

    // Since Java 8 a zero-width match at index 0 no longer yields a leading empty string, so a
    // single sentence that starts at the very beginning of the contents arrives here in one
    // piece. Accept it when it is a well-formed sentence wrapped across several lines.
    if (sentenceStartPattern.matcher(mrgContents).find() && parensMatch(mrgContents)) {
      return new String[] { mrgContents };
    }

    throw new IllegalArgumentException(
        "Parentheses counts do not match for treebank sentence: " + unbalancedLine);
  }

  /**
   * Checks whether a string contains as many opening as closing parentheses. Only the counts are
   * compared; the order of the parentheses is not checked.
   *
   * @param contents
   *          the string to inspect
   * @return true if the number of '(' equals the number of ')'
   */
  public static boolean parensMatch(String contents) {
    int balance = 0;
    for (int i = 0; i < contents.length(); i++) {
      char c = contents.charAt(i);
      if (c == '(') {
        balance++;
      } else if (c == ')') {
        balance--;
      }
    }
    return balance == 0;
  }

  /**
   * This method parses an entire document's worth of treebanked sentences.
   *
   * @param parse
   *          a single document provided as treebank parenthesized parses
   * @param textOffset
   *          a value that corresponds to the character offset of the first character of the
   *          document. The appropriate value for this method will typically be 0.
   * @param text
   *          a single document provided as plain text. If you do not have access to the original
   *          plain text of the document, you can generate some using
   *          {@link #inferPlainText(String)}.
   * @return the top nodes of the sentence parses, in document order. Each sentence is aligned
   *         starting at the text offset where the previous sentence ended.
   */
  public static List<TopTreebankNode> parseDocument(String parse, int textOffset, String text) {
    List<TopTreebankNode> returnValues = new ArrayList<TopTreebankNode>();
    int sentenceOffset = textOffset;
    for (String sentenceParse : splitSentences(parse)) {
      TopTreebankNode topNode = parse(sentenceParse, text, sentenceOffset);
      sentenceOffset = topNode.getTextEnd();
      returnValues.add(topNode);
    }
    return returnValues;
  }

}