001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.util.treebank;
025
026import java.util.ArrayList;
027import java.util.List;
028import java.util.Stack;
029import java.util.regex.Matcher;
030import java.util.regex.Pattern;
031
032/**
033 * This class was written to be a stand alone parser for the Penn Treebank data. Basically, I need a
034 * way to synch up the propbank data with extents of plain text that are labeled. This is not
035 * possible to do without parsing the treebank data first. The parse method will parse a single
036 * sentence from the treebank data from e.g. wsj/mrg/06/wsj_0656.mrg.
037 * 
038 * I initially looked at the OpenNLP treebank parser, but they made a few assumptions about what
039 * they wanted to keep from the parse that would make it difficult to align with the propbank data. See:
040 * http://sourceforge.net/projects/opennlp/forums/forum/9943/topic/1751983 for relevant discussion.
041 * I looked at their parsing implementation and tried to modify it. However, I think the code below
042 * bears little resemblance to theirs. But there may yet be some snippets taken directly out of that
043 * code. The two regular expressions used are very similar.
044 * 
045 * <br>
046 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
047 * All rights reserved.
048 * 
049 * 
050 * @author Philip Ogren
051 */
052public class TreebankFormatParser {
053  /**
054   * used to identify tokens in Penn Treebank labeled constituents. It will match strings such as:
055   * <ul>
056   * <li>(NNP Community)
057   * <li>(: --)
058   * <li>(-NONE- *U*)
059   * </ul>
060   */
061  public static final String LEAF_NODE_REGEX = "\\(([^( )]+) ([^( )]+)\\s*\\)";
062
063  private static Pattern leafNodePattern = Pattern.compile(LEAF_NODE_REGEX);
064
065  /**
066   * Uses the leafNodePattern to identify a string as a terminal. Examples:
067   * <ul>
068   * <li>parseFragment = "(NNP Community)", returns a leaf node
069   * <li>parseFragment = "(QP ($ $) (CD 107) (CD million) )", returns null
070   * </ul>
071   * 
072   * @param parseFragment
073   *          some fragment of a treebank parse.
074   * @return if the string matches, then a node will be returned. Otherwise, null is returned.
075   */
076  public static TreebankNode getLeafNode(String parseFragment) {
077    Matcher leafNodeMatcher = leafNodePattern.matcher(parseFragment);
078    if (leafNodeMatcher.matches()) {
079      String type = leafNodeMatcher.group(1);
080      String value = leafNodeMatcher.group(2);
081      TreebankNode node = new TreebankNode();
082      node.setType(getTypeFromType(type));
083      node.setTags(getTagsFromType(type));
084      node.setValue(value);
085      node.setLeaf(true);
086      // the token and the value will almost always be the same except for
087      // a few special values (e.g. "-RCB-", "-LCB-", and "-NONE-")
088      String token = getToken(node.getValue(), node.getType());
089      node.setText(token);
090      return node;
091    }
092    return null;
093  }
094
095  private static String getTypeFromType(String fullType) {
096    if (fullType.startsWith("-"))
097      return fullType.substring(0, fullType.indexOf('-', 1) + 1);
098
099    return fullType.split("[-=]")[0];
100  }
101
102  private static String[] getTagsFromType(String fullType) {
103    if (fullType.startsWith("-")) {
104      String rest = fullType.substring(fullType.indexOf('-', 1) + 1);
105      if (rest.length() > 0)
106        return rest.split("[-=]");
107      else
108        return new String[0];
109    } else {
110      String[] parts = fullType.split("[-=]");
111      String[] tags = new String[parts.length - 1];
112
113      for (int i = 1; i < parts.length; i++)
114        tags[i - 1] = parts[i];
115      return tags;
116    }
117  }
118
119  /**
120   * used to identify the type of a consituent in a treebank parse tree. It will match strings such
121   * as:
122   * <ul>
123   * <li>"NNP" in "(NNP Community)"
124   * <li>":" in "(: --)"
125   * <li>"-NONE-" in "(-NONE- *U*)"
126   * </ul>
127   */
128  public static final String TYPE_REGEX = "^\\(([^() ]+)";
129
130  private static Pattern typePattern = Pattern.compile(TYPE_REGEX);
131
132  /**
133   * Returns the type of a constituent of some fragment of a treebank parse. Assumes that the first
134   * character is a parenthesis. Examples:
135   * <ul>
136   * <li>parseFragment = "(NP-LOC (NNP Calif.) )" return = "NP-LOC"
137   * <li>parseFragment = "(NP" return "NP"
138   * <li>parseFragment = "(-NONE- *U*) ) (PP (IN of)" return = "-NONE-"
139   * </ul>
140   * 
141   * @param parseFragment
142   *          some fragment of a treebank parse
143   * @return the type of the constituent.
144   */
145  public static String getType(String parseFragment) {
146    Matcher typeMatcher = typePattern.matcher(parseFragment);
147    if (typeMatcher.find())
148      return typeMatcher.group(1);
149    return null;
150  }
151
152  public static final String cleanUPRegex1 = "\\s+";
153
154  private static final Pattern cleanUpPattern1 = Pattern.compile(cleanUPRegex1, Pattern.MULTILINE);
155
156  public static final String cleanUPRegex2 = "\\( \\(";
157
158  private static final Pattern cleanUpPattern2 = Pattern.compile(cleanUPRegex2, Pattern.MULTILINE);
159
160  public static final String cleanUPRegex3 = "\\) \\)";
161
162  private static final Pattern cleanUpPattern3 = Pattern.compile(cleanUPRegex3, Pattern.MULTILINE);
163
164  public static final String cleanUPRegex4 = "\\s*\\(\\s*\\(";
165
166  private static final Pattern cleanUpPattern4 = Pattern.compile(cleanUPRegex4, Pattern.MULTILINE);
167
168  /**
169   * This method was created simply as a way to clean up the parse string for a sentence in the
170   * treebank syntax. The most important thing that it does is add a type called TOP to the top node
171   * of the sentence. This simplifies parsing. The other string replacements just remove white space
172   * and such and are probably unnecessary. This was inspired by the OpenNLP solution which takes in
173   * one line at a time from a file that has been modified in this way.
174   * 
175   * @param parse
176   *          a String in the treebank format
177   * @return a String in the treebank that has been cleaned up a bit.
178   */
179  public static String prepareString(String parse) {
180    parse = cleanUpPattern1.matcher(parse).replaceAll(" ");
181    parse = cleanUpPattern2.matcher(parse).replaceAll("((");
182    parse = cleanUpPattern3.matcher(parse).replaceAll("))");
183    parse = cleanUpPattern4.matcher(parse).replaceFirst("(TOP (");
184    return parse.trim();
185  }
186
187  /**
188   * A treebank parse does not preserve whitespace information. This method provides a simple
189   * mechanism for inferring the original plain text of a treebank parse. If you have access to the
190   * original plain text, then you can bypass use of this method by calling the appropriate parse
191   * method.
192   * 
193   * @see #parse(String, String, int)
194   * 
195   * @param treebankText
196   *          One or more parses in Treebank parenthesized format.
197   * @return a "best" guess of the original plain text given in the parse.
198   */
199  public static String inferPlainText(String treebankText) {
200    StringBuilder sb = new StringBuilder();
201    for (String parse : splitSentences(treebankText)) {
202      Matcher matcher = leafNodePattern.matcher(parse);
203      while (matcher.find()) {
204        TreebankNode node = getLeafNode(matcher.group());
205        if (node.getText() != null && node.getText().length() > 0) {
206          int lastIndex = sb.length() - 1;
207          if (lastIndex > 0 && !needsSpaceBefore(node.getText()) && sb.charAt(lastIndex) == ' ') {
208            sb.deleteCharAt(lastIndex);
209          }
210          sb.append(node.getText());
211          if (needsSpaceAfter(node.getText())) {
212            sb.append(" ");
213          }
214        }
215      }
216      int lastIndex = sb.length() - 1;
217      if (lastIndex >= 0 && sb.charAt(lastIndex) == ' ') {
218        sb.deleteCharAt(lastIndex);
219      }
220      sb.append('\n');
221    }
222    return sb.toString().trim();
223  }
224
225  private static boolean needsSpaceBefore(String tokenText) {
226    String[] noSpaceTokens = new String[] {
227        ".",
228        ",",
229        ":",
230        ";",
231        "?",
232        "'s",
233        "'t",
234        "\"",
235        "!",
236        ")",
237        "]" };
238    for (String noSpaceToken : noSpaceTokens) {
239      if (tokenText.equals(noSpaceToken)) {
240        return false;
241      }
242    }
243    return true;
244  }
245
246  private static boolean needsSpaceAfter(String tokenText) {
247    String[] noSpaceTokens = new String[] { "\"", "(", "[" };
248    for (String noSpaceToken : noSpaceTokens) {
249      if (tokenText.equals(noSpaceToken)) {
250        return false;
251      }
252    }
253    return true;
254  }
255
256  /**
257   * Create TreebankNode objects corresponding to the given TreeBank format parse, e.g.:
258   * 
259   * <PRE>
260   * ( (X (NP (NP (NML (NN Complex ) (NN trait )) (NN analysis )) (PP (IN of ) (NP (DT the ) (NN mouse ) (NN striatum )))) (: : ) (S (NP-SBJ (JJ independent ) (NNS QTLs )) (VP (VBP modulate ) (NP (NP (NN volume )) (CC and ) (NP (NN neuron ) (NN number)))))) )
261   * </PRE>
262   * 
263   * The text will be inferred automatically from the words in the parse.
264   * 
265   * @param parse
266   *          A TreeBank formatted parse
267   * @return The TreebankNode root of the parse tree
268   * @see #inferPlainText(String)
269   * @see #parse(String, String, int)
270   */
271  public static TopTreebankNode parse(String parse) {
272    parse = prepareString(parse);
273    String plainText = inferPlainText(parse).trim();
274    return parse(parse, plainText, 0);
275  }
276
277  private static void checkText(TreebankNode node, String text) {
278    String text1 = node.getText();
279    int start = node.getTextBegin();
280    int end = node.getTextEnd();
281    String text2 = text.substring(start, end);
282    if (!text1.equals(text2)) {
283      // TreeBank adds in (. .) nodes in odd places, e.g. when a sentence
284      // ends with U.S. (and no final period). As a result, we need to
285      // allow periods to match whitespace and adjust the node bounds.
286      String prefix1 = text1.substring(0, text1.length() - 1);
287      String prefix2 = text2.substring(0, text2.length() - 1);
288      if (text1.endsWith(".") && prefix1.equals(prefix2)) {
289        node.setTextEnd(node.getTextEnd() - 1);
290      } else {
291        throw new IllegalArgumentException(
292            "plain text does not align with tokens in treebank parse.  node text = '" + text1
293                + "'  plain text = '" + text2 + "'");
294      }
295    }
296  }
297
298  /**
299   * Create TreebankNode objects corresponding to the given TreeBank format parse, e.g.:
300   * 
301   * <PRE>
302   * ( (X (NP (NP (NML (NN Complex ) (NN trait )) (NN analysis )) (PP (IN of ) (NP (DT the ) (NN mouse ) (NN striatum )))) (: : ) (S (NP-SBJ (JJ independent ) (NNS QTLs )) (VP (VBP modulate ) (NP (NP (NN volume )) (CC and ) (NP (NN neuron ) (NN number)))))) )
303   * </PRE>
304   * 
305   * The start and end offsets of each TreebankNode will be aligned to the word offsets in the given
306   * text.
307   * 
308   * @param parse
309   *          A TreeBank formatted parse
310   * @param text
311   *          The text to which the parse should be aligned
312   * @param textOffset
313   *          The character offset at which the parse text should start to be aligned. For example,
314   *          if the words of the parse start right at the beginning of the text, the appropriate
315   *          textOffset is 0.
316   * @return The TreebankNode root of the parse tree. The root node will be a TopTreebankNode, and
317   *         all its descendants will be TreebankNodes.
318   * @see TopTreebankNode
319   * @see TreebankNode
320   */
321  public static TopTreebankNode parse(String parse, String text, int textOffset) {
322    try {
323      TopTreebankNode topNode = new TopTreebankNode();
324      parse = prepareString(parse);
325      // used to capture the plain text of the sentence.
326      StringBuffer consumedText = new StringBuffer();
327      if (text != null) {
328        textOffset = movePastWhiteSpaceChars(text, textOffset);
329        consumedText.append(text.substring(0, textOffset));
330      }
331
332      Stack<Integer> parseOffsetStack = new Stack<Integer>();
333      Stack<Integer> plainTextOffsetStack = new Stack<Integer>();
334
335      // keeps the nodes that are waiting for their parents to be completed.
336      Stack<TreebankNode> parseStack = new Stack<TreebankNode>();
337
338      for (int ci = 0; ci < parse.length(); ci++) {
339        char c = parse.charAt(ci);
340        if (c == '(') {
341          // at the start of each constituent we will push the starting
342          // index of it
343          // w.r.t. the parse string.
344          parseOffsetStack.push(ci);
345          // also push the starting index w.r.t. the plain text of the
346          // sentence.
347          plainTextOffsetStack.push(consumedText.length());
348        } else if (c == ')') {
349          int begin = parseOffsetStack.pop();
350          int end = ci;
351          // the portion of the parse string that corresponds to the
352          // constituent that
353          // we found the left bracket for ')'.
354          String subParse = parse.substring(begin, end + 1);
355
356          int textBegin = plainTextOffsetStack.pop();
357
358          TreebankNode node = getLeafNode(subParse);
359          if (node != null) {
360            node.setTopNode(topNode);
361            node.setParseBegin(begin);
362            node.setParseEnd(end + 1);
363            String token = node.getText();
364
365            if (token.length() > 0) {
366              int realBegin = movePastWhiteSpaceChars(text, textBegin);
367              consumedText.append(text.substring(textBegin, realBegin));
368              consumedText.append(token);
369              node.setTextBegin(realBegin);
370              node.setTextEnd(realBegin + token.length());
371
372            } else {
373              node.setTextBegin(textBegin);
374              node.setTextEnd(textBegin + token.length());
375            }
376            checkText(node, text);
377            parseStack.push(node);
378          } else {
379            if (parse.lastIndexOf(')') == ci) // the last ')' is the top
380              // node.
381              node = topNode; // this is the instance that will be
382            // returned.
383            else
384              node = new TreebankNode();
385            node.setTopNode(topNode);
386            node.setParseBegin(begin);
387            node.setParseEnd(end + 1);
388            String type = getType(subParse);
389            node.setType(getTypeFromType(type));
390            node.setTags(getTagsFromType(type));
391            node.setLeaf(false);
392            // keep adding the nodes on the stack until it is empty or
393            // the next node on the stack starts before the current node
394            // (i.e. has a different
395            // parent than the current node that will be completed
396            // later.)
397            while (parseStack.size() > 0
398                && parseStack.peek().getParseBegin() > node.getParseBegin()) {
399              TreebankNode child = parseStack.pop();
400              node.addChild(child);
401              child.setParent(node);
402            }
403            // we typically add a token followed by a space to
404            // plainText, except when the
405            // token is an empty string as it is when the corresponding
406            // type is -NONE-
407            int realBegin = movePastWhiteSpaceChars(text, textBegin);
408            node.setTextBegin(realBegin);
409            node.setTextEnd(Math.max(realBegin, consumedText.length()));
410
411            try {
412              node.setText(consumedText.substring(node.getTextBegin(), node.getTextEnd()));
413            } catch (StringIndexOutOfBoundsException sioobe) {
414              node.setText("");
415            }
416            checkText(node, text);
417            parseStack.push(node);
418          }
419        }
420      }
421
422      topNode.setTreebankParse(parse);
423      topNode.initTerminalNodes();
424      return topNode;
425    } catch (RuntimeException e) {
426      throw new IllegalArgumentException("exception thrown when parsing the following: " + parse, e);
427    }
428  }
429
430  private static final Pattern nonwhiteSpaceCharPattern = Pattern.compile("[^\\s]");
431
432  public static int movePastWhiteSpaceChars(String text, int textOffset) {
433    Matcher matcher = nonwhiteSpaceCharPattern.matcher(text);
434    if (matcher.find(textOffset)) {
435      return matcher.start();
436    }
437    return textOffset;
438  }
439
440  /**
441   * Replace specially encoded tokens with their original textual representation.
442   * (http://www.cis.upenn.edu/~treebank/tokenization.html)
443   * 
444   * @param value
445   * @param type
446   * @return The string in its original textual representation.
447   */
448  private static String getToken(String value, String type) {
449    value = value.replace("-RCB-", "}");
450    value = value.replace("-LCB-", "{");
451    value = value.replace("-RRB-", ")");
452    value = value.replace("-LRB-", "(");
453    value = value.replace("-RSB-", "]");
454    value = value.replace("-LSB-", "[");
455    value = value.replace("``", "\"");
456    value = value.replace("''", "\"");
457
458    if (type.equals("-NONE-"))
459      return "";
460
461    if (value.contains("\\/"))
462      return value.replace("\\/", "/");
463
464    return value;
465  }
466
467  /**
468   * Generally speaking, we expect one treebanked sentence per line. This method will simply return
469   * the lines of a document assuming that each line has matching parentheses. However, the native
470   * penn treebank data contains parsed sentences that are broken up across multiple lines. Each
471   * sentence in the PTB starts with "( (S..." and so we split on this to get the sentences. If this
472   * method sees "( (S...", then it will return the contents split on that pattern. If not, it will
473   * return the lines of the input string.
474   * 
475   * Splits an .mrg file (e.g. wsj/mrg/00/wsj_0020.mrg) into sentence parses.
476   * 
477   * @return individual sentence parses from treebank - i.e. strings of the form "( (S..."
478   */
479
480  public static String[] splitSentences(String mrgContents) {
481    // Splitting on this regular expression can cause the first value
482    // in the array to be an empty string if e.g. the first line of the file
483    // is blank
484    String[] contents = mrgContents.split("(?=\\(\\s*\\()");
485    if (contents.length > 1) {
486      if (contents.length > 0 && contents[0].trim().equals("")) {
487        String[] returnValues = new String[contents.length - 1];
488        System.arraycopy(contents, 1, returnValues, 0, returnValues.length);
489        return returnValues;
490      } else {
491        String[] returnValues = new String[contents.length];
492        System.arraycopy(contents, 0, returnValues, 0, returnValues.length);
493        return returnValues;
494      }
495    }
496
497    String[] lines = mrgContents.split("\r?\n");
498    for (String line : lines) {
499      if (!parensMatch(line)) {
500        throw new IllegalArgumentException(
501            "Parentheses counts do not match for treebank sentence: " + line);
502      }
503    }
504    return lines;
505  }
506
507  public static boolean parensMatch(String contents) {
508    int leftParenCount = 0;
509    int rightParenCount = 0;
510
511    for (char c : contents.toCharArray()) {
512      if (c == '(')
513        leftParenCount++;
514      if (c == ')')
515        rightParenCount++;
516    }
517
518    return leftParenCount == rightParenCount;
519
520  }
521
522  /**
523   * This method parses an entire documents worth of treebanked sentences.
524   * 
525   * @param parse
526   *          a single document provided as treebank parenthesized parses
527   * @param textOffset
528   *          a value that corresponds to the character offset of the first character of the
529   *          document. The appropriate value for this method will typically be 0.
530   * @param text
531   *          a single document provided as plain text. If you do not have access to the original
532   *          plain text of the document, you can generate some using
533   *          {@link #inferPlainText(String)}.
534   */
535  public static List<TopTreebankNode> parseDocument(String parse, int textOffset, String text) {
536    List<TopTreebankNode> returnValues = new ArrayList<TopTreebankNode>();
537    String[] sentenceParses = splitSentences(parse);
538
539    for (String sentenceParse : sentenceParses) {
540      TopTreebankNode topNode = parse(sentenceParse, text, textOffset);
541      textOffset = topNode.getTextEnd();
542      returnValues.add(topNode);
543    }
544    return returnValues;
545  }
546
547}