001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.token.tokenizer;
025
026import java.util.regex.Pattern;
027
028/**
029 * <br>
030 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
031 * All rights reserved.
032 * 
033 * 
034 * @author Philip Ogren
035 * 
036 *         This code was derived directly (i.e. translated) from a python script that Steven Bethard
037 *         gave me called tokenizer.py. This copyright of this script is owned by the Regents of the
038 *         University of Colorado and therefore there is no restriction on our use and distribution
039 *         of this derivitive work here. The python script was derived from a sed script available
040 *         here:
041 * 
042 *         http://www.cis.upenn.edu/~treebank/tokenization.html
043 * 
044 *         Steve's script fixes several misc. errors.
045 * 
046 * @author Philip Ogren
047 * 
048 */
049public class PennTreebankTokenizer extends Tokenizer_ImplBase {
050  // different brace type regexes
051  public static String openBracesRegex = "\\[\\(\\{\\<";
052
053  public static String closedBracesRegex = "\\]\\)\\}\\>";
054
055  public static String bracesRegex = "([" + openBracesRegex + closedBracesRegex + "])";
056
057  public static Pattern bracesPattern = Pattern.compile(bracesRegex);
058
059  // ellipsis regex
060  public static String ellipsisRegex = "(" + Pattern.quote("...") + ")";
061
062  public static Pattern ellipsisPattern = Pattern.compile(ellipsisRegex);
063
064  // comma regex - any comma that is not between two digits
065  public static String commaRegex = "((?<!\\d),|,(?!\\d))";
066
067  public static Pattern commaPattern = Pattern.compile(commaRegex);
068
069  // dollar sign regex
070  // any dollar sign, potentially preceded by capitals (e.g. US$)
071  public static String dollarSignRegex = "([A-Z]*\\$)";
072
073  public static Pattern dollarSignPattern = Pattern.compile(dollarSignRegex);
074
075  // ampersand matcher regex
076  // any ampersamd not surrounded by two uppercase letters
077  public static String ampersandRegex = "((?<![A-Z])&|&(?![A-Z]))";
078
079  public static Pattern ampersandPattern = Pattern.compile(ampersandRegex);
080
081  // dash regex
082  // any set of 2 or more dashes, or any dash followed by whitespace
083  public static String dashRegex = "(--+|-(?=\\s))";
084
085  public static Pattern dashPattern = Pattern.compile(dashRegex);
086
087  // colon regex
088  // if colon's between digits, then take the digits (e.g. 4:30)
089  public static String colonRegex = "(\\d+:\\d+|:)";
090
091  public static Pattern colonPattern = Pattern.compile(colonRegex);
092
093  // other punctuation regex
094  // punctuation not followed by a dash (e.g. not 62%-owned)
095  public static String nonFinalPunctRegex = "(``|[|;@#`%])(?!-)";
096
097  public static Pattern nonFinalPunctPattern = Pattern.compile(nonFinalPunctRegex);
098
099  // period regex
100  // any period ending a sequence of digits, or
101  // any set of exactly two periods, or
102  // any period not preceded by two other periods and followed only
103  // by punctuation to the end of the line
104  public static String periodRegex = "((?<=\\d)\\.(?=[^\\n\\S])|" + "(?<=[^.]\\.)\\.(?![.])|"
105      + "(?<!\\.\\.)\\.[" + closedBracesRegex + "\"'`/_#*\\s]*$)";
106
107  public static Pattern periodPattern = Pattern.compile(periodRegex, Pattern.MULTILINE);
108
109  // any punctuation that always indicates the end of a sentence
110  public static String nonPeriodPunctRegex = "([?!])";
111
112  public static Pattern nonPeriodPunctPattern = Pattern.compile(nonPeriodPunctRegex);
113
114  // single quote regex
115  // a single quote preceding digits and an optional s(e.g. '80s), or
116  // any single quote not beside another single quote and that has
117  // whitespace on one side or the other
118  public static String singleQuoteRegex = "('\\d+s?|(?<=\\s)'(?!')|(?<!')'(?=\\s))";
119
120  public static Pattern singleQuotePattern = Pattern.compile(singleQuoteRegex);
121
122  public static String tripleQuoteRegex = "'''";
123
124  public static Pattern tripleQuotePattern = Pattern.compile(tripleQuoteRegex);
125
126  public static String doubleQuoteRegex = "''";
127
128  public static Pattern doubleQuotePattern = Pattern.compile(doubleQuoteRegex);
129
130  public static String quoteRegex = Pattern.quote("\"");
131
132  public static Pattern quotePattern = Pattern.compile(quoteRegex);
133
134  // abbreviation regexes
135  public static String oneWordAbbreviationRegex = "('ll|'re|'ve|n't|'[smd])\\b";
136
137  public static Pattern oneWordAbbreviationPattern = Pattern.compile(
138      oneWordAbbreviationRegex,
139      Pattern.CASE_INSENSITIVE);
140
141  public static String[] twoWordAbbreviationRegexes = new String[] {
142      "\\b(can)(not)\\b",
143      "\\b(d')(ye)\\b",
144      "\\b(gim)(me)\\b",
145      "\\b(gon)(na)\\b",
146      "\\b(got)(ta)\\b",
147      "\\b(lem)(me)\\b",
148      "\\b(more)('n)\\b",
149      "\\b(wan)(na)\\b" };
150
151  public static Pattern[] twoWordAbbreviationPatterns = new Pattern[twoWordAbbreviationRegexes.length];
152  static {
153    for (int i = 0; i < twoWordAbbreviationRegexes.length; i++) {
154      twoWordAbbreviationPatterns[i] = Pattern.compile(
155          twoWordAbbreviationRegexes[i],
156          Pattern.CASE_INSENSITIVE);
157    }
158  }
159
160  public static String[] threeWordAbbreviationRegexes = new String[] {
161      "\\b(wha)(dd)(ya)\\b",
162      "\\b(wha)(t)(cha)\\b" };
163
164  public static Pattern[] threeWordAbbreviationPatterns = new Pattern[threeWordAbbreviationRegexes.length];
165  static {
166    for (int i = 0; i < threeWordAbbreviationRegexes.length; i++) {
167      threeWordAbbreviationPatterns[i] = Pattern.compile(
168          threeWordAbbreviationRegexes[i],
169          Pattern.CASE_INSENSITIVE);
170    }
171  }
172
173  public static String tAbbreviationRegex = "('t)(is|was)\\b";
174
175  public static Pattern tAbbreviationPattern = Pattern.compile(tAbbreviationRegex);
176
177  // space regexes
178  public static String beginOrEndRegex = "^|$";
179
180  public static Pattern beginOrEndPattern = Pattern.compile(beginOrEndRegex, Pattern.MULTILINE);
181
182  public static String extraSpaceRegex = "^(\\s+)|(\\s+)$|(?<=[ \\t])[ \\t]+";
183
184  public static Pattern extraSpacePattern = Pattern.compile(extraSpaceRegex, Pattern.MULTILINE);
185
186  public static String multipleWhitespaceRegex = "(\\s+)";
187
188  public static Pattern multipleWhitespacePattern = Pattern.compile(
189      multipleWhitespaceRegex,
190      Pattern.MULTILINE);
191
192  protected Pattern[] patterns;
193
194  public PennTreebankTokenizer() {
195    patterns = new Pattern[] {
196        ellipsisPattern,
197        commaPattern,
198        dollarSignPattern,
199        ampersandPattern,
200        dashPattern,
201        colonPattern,
202        nonFinalPunctPattern,
203        periodPattern,
204        nonPeriodPunctPattern,
205        bracesPattern };
206  }
207
208  /**
209   * Tokenizes the input text and returns a string array corresponding to the tokens.
210   */
211  public String[] getTokenTexts(String text) {
212    for (Pattern pattern : patterns) {
213      text = pattern.matcher(text).replaceAll(" $1 ");
214    }
215
216    text = beginOrEndPattern.matcher(text).replaceAll(" ");
217
218    text = tripleQuotePattern.matcher(text).replaceAll(" ' '' ");
219    text = doubleQuotePattern.matcher(text).replaceAll(" '' ");
220    text = singleQuotePattern.matcher(text).replaceAll(" $1 ");
221    text = quotePattern.matcher(text).replaceAll(" \" ");
222
223    text = oneWordAbbreviationPattern.matcher(text).replaceAll(" $1");
224    for (Pattern pattern : twoWordAbbreviationPatterns)
225      text = pattern.matcher(text).replaceAll(" $1 $2");
226    text = tAbbreviationPattern.matcher(text).replaceAll(" $1 $2");
227    for (Pattern pattern : threeWordAbbreviationPatterns)
228      text = pattern.matcher(text).replaceAll(" $1 $2 $3");
229
230    text = extraSpacePattern.matcher(text).replaceAll("");
231
232    // this was added because one of beginOrEndPattern or extraSpacePattern do not seem to be
233    // working correctly.
234    text = multipleWhitespacePattern.matcher(text).replaceAll(" ");
235
236    String[] tokens = text.toString().split(" ");
237    if (tokens.length == 1 && tokens[0].equals("")) {
238      tokens = new String[0];
239    }
240    return tokens;
241  }
242
243}