/**
 * Copyright (c) 2007-2008, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.token.tokenizer;

import java.util.regex.Pattern;

/**
 * Regular-expression based tokenizer that follows the Penn Treebank tokenization conventions.
 *
 * <br>
 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
 * All rights reserved.
 *
 * <p>
 * This code was derived directly (i.e. translated) from a python script that Steven Bethard
 * gave me called tokenizer.py. The copyright of this script is owned by the Regents of the
 * University of Colorado and therefore there is no restriction on our use and distribution
 * of this derivative work here. The python script was derived from a sed script available
 * here:
 *
 * <p>
 * http://www.cis.upenn.edu/~treebank/tokenization.html
 *
 * <p>
 * Steve's script fixes several misc. errors.
 *
 * @author Philip Ogren
 */
public class PennTreebankTokenizer extends Tokenizer_ImplBase {
  // different brace type regexes
  // (these two are character-class fragments, not complete expressions)
  public static String openBracesRegex = "\\[\\(\\{\\<";

  public static String closedBracesRegex = "\\]\\)\\}\\>";

  // any single opening or closing brace
  public static String bracesRegex = "([" + openBracesRegex + closedBracesRegex + "])";

  public static Pattern bracesPattern = Pattern.compile(bracesRegex);

  // ellipsis regex
  // exactly three literal periods
  public static String ellipsisRegex = "(" + Pattern.quote("...") + ")";

  public static Pattern ellipsisPattern = Pattern.compile(ellipsisRegex);

  // comma regex - any comma that is not between two digits
  public static String commaRegex = "((?<!\\d),|,(?!\\d))";

  public static Pattern commaPattern = Pattern.compile(commaRegex);

  // dollar sign regex
  // any dollar sign, potentially preceded by capitals (e.g. US$)
  public static String dollarSignRegex = "([A-Z]*\\$)";

  public static Pattern dollarSignPattern = Pattern.compile(dollarSignRegex);

  // ampersand matcher regex
  // any ampersand not surrounded by two uppercase letters
  public static String ampersandRegex = "((?<![A-Z])&|&(?![A-Z]))";

  public static Pattern ampersandPattern = Pattern.compile(ampersandRegex);

  // dash regex
  // any set of 2 or more dashes, or any dash followed by whitespace
  public static String dashRegex = "(--+|-(?=\\s))";

  public static Pattern dashPattern = Pattern.compile(dashRegex);

  // colon regex
  // if colon's between digits, then take the digits (e.g. 4:30)
  public static String colonRegex = "(\\d+:\\d+|:)";

  public static Pattern colonPattern = Pattern.compile(colonRegex);

  // other punctuation regex
  // punctuation not followed by a dash (e.g. not 62%-owned)
  public static String nonFinalPunctRegex = "(``|[|;@#`%])(?!-)";

  public static Pattern nonFinalPunctPattern = Pattern.compile(nonFinalPunctRegex);

  // period regex
  // any period that follows a digit and precedes non-newline whitespace, or
  // any set of exactly two periods (the second one is matched), or
  // any period not preceded by two other periods and followed only
  // by punctuation to the end of the line (that trailing punctuation is part of the match)
  public static String periodRegex = "((?<=\\d)\\.(?=[^\\n\\S])|" + "(?<=[^.]\\.)\\.(?![.])|"
      + "(?<!\\.\\.)\\.[" + closedBracesRegex + "\"'`/_#*\\s]*$)";

  // MULTILINE so that '$' also matches at the end of every line, not only at the end of input
  public static Pattern periodPattern = Pattern.compile(periodRegex, Pattern.MULTILINE);

  // any punctuation that always indicates the end of a sentence
  public static String nonPeriodPunctRegex = "([?!])";

  public static Pattern nonPeriodPunctPattern = Pattern.compile(nonPeriodPunctRegex);

  // single quote regex
  // a single quote preceding digits and an optional s (e.g. '80s), or
  // any single quote not beside another single quote and that has
  // whitespace on one side or the other
  public static String singleQuoteRegex = "('\\d+s?|(?<=\\s)'(?!')|(?<!')'(?=\\s))";

  public static Pattern singleQuotePattern = Pattern.compile(singleQuoteRegex);

  // three consecutive single quotes
  public static String tripleQuoteRegex = "'''";

  public static Pattern tripleQuotePattern = Pattern.compile(tripleQuoteRegex);

  // two consecutive single quotes
  public static String doubleQuoteRegex = "''";

  public static Pattern doubleQuotePattern = Pattern.compile(doubleQuoteRegex);

  // a literal double-quote character
  public static String quoteRegex = Pattern.quote("\"");

  public static Pattern quotePattern = Pattern.compile(quoteRegex);

  // abbreviation regexes
  // clitics that are split off the preceding word (e.g. can't -> ca n't)
  public static String oneWordAbbreviationRegex = "('ll|'re|'ve|n't|'[smd])\\b";

  public static Pattern oneWordAbbreviationPattern = Pattern.compile(
      oneWordAbbreviationRegex,
      Pattern.CASE_INSENSITIVE);

  // single words that are split into two tokens (e.g. cannot -> can not)
  public static String[] twoWordAbbreviationRegexes = new String[] {
      "\\b(can)(not)\\b",
      "\\b(d')(ye)\\b",
      "\\b(gim)(me)\\b",
      "\\b(gon)(na)\\b",
      "\\b(got)(ta)\\b",
      "\\b(lem)(me)\\b",
      "\\b(more)('n)\\b",
      "\\b(wan)(na)\\b" };

  public static Pattern[] twoWordAbbreviationPatterns = new Pattern[twoWordAbbreviationRegexes.length];
  static {
    for (int i = 0; i < twoWordAbbreviationRegexes.length; i++) {
      twoWordAbbreviationPatterns[i] = Pattern.compile(
          twoWordAbbreviationRegexes[i],
          Pattern.CASE_INSENSITIVE);
    }
  }

  // single words that are split into three tokens (e.g. whatcha -> wha t cha)
  public static String[] threeWordAbbreviationRegexes = new String[] {
      "\\b(wha)(dd)(ya)\\b",
      "\\b(wha)(t)(cha)\\b" };

  public static Pattern[] threeWordAbbreviationPatterns = new Pattern[threeWordAbbreviationRegexes.length];
  static {
    for (int i = 0; i < threeWordAbbreviationRegexes.length; i++) {
      threeWordAbbreviationPatterns[i] = Pattern.compile(
          threeWordAbbreviationRegexes[i],
          Pattern.CASE_INSENSITIVE);
    }
  }

  // 'tis and 'twas
  // NOTE(review): unlike the other abbreviation patterns, this one is compiled without
  // CASE_INSENSITIVE, so e.g. "'Tis" is not split by it - confirm whether that is intended.
  public static String tAbbreviationRegex = "('t)(is|was)\\b";

  public static Pattern tAbbreviationPattern = Pattern.compile(tAbbreviationRegex);

  // space regexes
  // the (empty) position at the beginning or end of a line
  public static String beginOrEndRegex = "^|$";

  public static Pattern beginOrEndPattern = Pattern.compile(beginOrEndRegex, Pattern.MULTILINE);

  // whitespace at the beginning of a line, whitespace at the end of a line, or
  // any run of spaces/tabs that directly follows a space or tab
  public static String extraSpaceRegex = "^(\\s+)|(\\s+)$|(?<=[ \\t])[ \\t]+";

  public static Pattern extraSpacePattern = Pattern.compile(extraSpaceRegex, Pattern.MULTILINE);

  // any run of whitespace
  public static String multipleWhitespaceRegex = "(\\s+)";

  public static Pattern multipleWhitespacePattern = Pattern.compile(
      multipleWhitespaceRegex,
      Pattern.MULTILINE);

  /**
   * The single-group punctuation patterns applied, in array order, as the first stage of
   * {@link #getTokenTexts(String)}; each match is surrounded with spaces.
   */
  protected Pattern[] patterns;

  /**
   * Creates a tokenizer whose first-stage pattern list is: ellipsis, comma, dollar sign,
   * ampersand, dash, colon, non-final punctuation, period, sentence-final punctuation, braces.
   */
  public PennTreebankTokenizer() {
    patterns = new Pattern[] {
        ellipsisPattern,
        commaPattern,
        dollarSignPattern,
        ampersandPattern,
        dashPattern,
        colonPattern,
        nonFinalPunctPattern,
        periodPattern,
        nonPeriodPunctPattern,
        bracesPattern };
  }

  /**
   * Tokenizes the input text and returns a string array corresponding to the tokens.
   *
   * <p>
   * The text is rewritten in stages so that tokens end up separated by single spaces, and is
   * then split on the space character:
   * <ol>
   * <li>every match of each pattern in {@link #patterns} is surrounded with spaces;</li>
   * <li>a space is inserted at the beginning and end of every line;</li>
   * <li>quote characters are separated from the surrounding text;</li>
   * <li>contractions and fused words (e.g. {@code can't}, {@code cannot}) are split;</li>
   * <li>surplus whitespace is removed and remaining whitespace runs become a single space.</li>
   * </ol>
   *
   * @param text
   *          the text to tokenize
   * @return the token strings in order of appearance; an empty array if the text yields no tokens
   *         (e.g. it is empty or consists only of whitespace)
   * @throws NullPointerException
   *           if {@code text} is null (raised when the first pattern is matched against it)
   */
  public String[] getTokenTexts(String text) {
    // stage 1: isolate punctuation; every pattern here captures its match in group 1
    for (Pattern pattern : patterns) {
      text = pattern.matcher(text).replaceAll(" $1 ");
    }

    // stage 2: pad each line so the whitespace look-arounds of the quote patterns can apply
    // at line boundaries
    text = beginOrEndPattern.matcher(text).replaceAll(" ");

    // stage 3: quotes - longest runs first so that ''' becomes ' followed by ''
    text = tripleQuotePattern.matcher(text).replaceAll(" ' '' ");
    text = doubleQuotePattern.matcher(text).replaceAll(" '' ");
    text = singleQuotePattern.matcher(text).replaceAll(" $1 ");
    text = quotePattern.matcher(text).replaceAll(" \" ");

    // stage 4: contractions and fused words
    text = oneWordAbbreviationPattern.matcher(text).replaceAll(" $1");
    for (Pattern pattern : twoWordAbbreviationPatterns) {
      text = pattern.matcher(text).replaceAll(" $1 $2");
    }
    text = tAbbreviationPattern.matcher(text).replaceAll(" $1 $2");
    for (Pattern pattern : threeWordAbbreviationPatterns) {
      text = pattern.matcher(text).replaceAll(" $1 $2 $3");
    }

    // stage 5: drop leading/trailing whitespace of each line and runs of spaces/tabs
    text = extraSpacePattern.matcher(text).replaceAll("");

    // this was added because one of beginOrEndPattern or extraSpacePattern do not seem to be
    // working correctly. (Line breaks between non-empty lines survive the previous step; this
    // turns them, and any other remaining whitespace run, into a single space.)
    text = multipleWhitespacePattern.matcher(text).replaceAll(" ");

    String[] tokens = text.split(" ");
    // splitting the empty string yields one empty token; report that as "no tokens"
    if (tokens.length == 1 && tokens[0].equals("")) {
      tokens = new String[0];
    }
    return tokens;
  }

}