001/* 002 * Copyright (c) 2011, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.ml.feature.function; 025 026import java.util.Collections; 027import java.util.List; 028 029import org.apache.uima.jcas.JCas; 030import org.apache.uima.jcas.tcas.Annotation; 031import org.cleartk.ml.Feature; 032import org.cleartk.ml.feature.extractor.CleartkExtractorException; 033import org.cleartk.ml.feature.extractor.NamedFeatureExtractor1; 034 035/** 036 * A feature extractor that generates a pattern based on the <a 037 * href="http://unicode.org/reports/tr49/">Unicode categories</a> of each of the characters in the 038 * annotation text. For example, "A-z0" is an uppercase letter, followed by a dash, followed by a 039 * lowercase letter, followed by a digit, and so would get the pattern "LuPdLlNd". 040 * 041 * <br> 042 * Copyright (c) 2011, Regents of the University of Colorado <br> 043 * All rights reserved. 044 * 045 * @author Steven Bethard 046 */ 047public class CharacterCategoryPatternFunction<T extends Annotation> implements FeatureFunction { 048 049 /** 050 * The type of pattern to generate in feature values. 051 */ 052 public static enum PatternType { 053 /** 054 * The standard pattern, where one category abbreviation is added to the feature value for each 055 * character in the text. 056 */ 057 ONE_PER_CHAR, 058 /** 059 * A simplified pattern, where if the same category appears many times in a row, the category is 060 * added to the feature value only once. For example "XX00" would get the pattern "LuNd" since 061 * there are two uppercase letters followed by two digits. 062 */ 063 REPEATS_MERGED, 064 /** 065 * Similar to REPEATS_MERGED, but distinguishes between the same category appearing once and 066 * more than once in a row. If the same category appears twice or more in a row, then we will 067 * mark that category with a Kleene plus '+'. For example "X000" would get the pattern "LuNd+" 068 * since there is a single uppercase letter followed by more than one digit. 069 */ 070 REPEATS_AS_KLEENE_PLUS 071 } 072 073 private PatternType patternType; 074 075 private String name; 076 077 public static <T extends Annotation> NamedFeatureExtractor1<T> createExtractor() { 078 return createExtractor(PatternType.ONE_PER_CHAR); 079 } 080 081 /* 082 * I would have returned a simple FeatureFunctionExtractor using the following code: return new 083 * FeatureFunctionExtractor<T>(new CoveredTextExtractor<T>(), false, new 084 * CharacterCategoryPatternFunction<T>()); but TimeAnnotator wanted a NamedFeatureExtractor1. So I 085 * did the following to maintain backwards compatibility. After all, the converted feature 086 * extractor was a NamedFeatureExtractor1. 087 */ 088 089 public static <T extends Annotation> NamedFeatureExtractor1<T> createExtractor( 090 PatternType patternType) { 091 final CharacterCategoryPatternFunction<T> ccpf = new CharacterCategoryPatternFunction<T>( 092 patternType); 093 return new NamedFeatureExtractor1<T>() { 094 095 @Override 096 public List<Feature> extract(JCas view, Annotation focusAnnotation) 097 throws CleartkExtractorException { 098 String text = focusAnnotation.getCoveredText(); 099 return ccpf.apply(new Feature(null, text)); 100 } 101 102 @Override 103 public String getFeatureName() { 104 return ccpf.getFeatureName(); 105 } 106 }; 107 } 108 109 /** 110 * Create the standard feature extractor, where one category is added to the feature value for 111 * each character in the text. See {@link PatternType#ONE_PER_CHAR}. 112 */ 113 public CharacterCategoryPatternFunction() { 114 this(PatternType.ONE_PER_CHAR); 115 } 116 117 /** 118 * Create a feature extractor with the specified pattern type. See {@link PatternType} for the 119 * acceptable pattern types. 120 * 121 * @param patternType 122 * The type of pattern to generate in feature values. 123 */ 124 public CharacterCategoryPatternFunction(PatternType patternType) { 125 this.patternType = patternType; 126 switch (this.patternType) { 127 case ONE_PER_CHAR: 128 this.name = "CharPattern"; 129 break; 130 case REPEATS_MERGED: 131 this.name = "CharPatternRepeatsMerged"; 132 break; 133 case REPEATS_AS_KLEENE_PLUS: 134 this.name = "CharPatternRepeatsAsKleenePlus"; 135 break; 136 } 137 } 138 139 public String getFeatureName() { 140 return this.name; 141 } 142 143 @Override 144 public List<Feature> apply(Feature feature) { 145 String featureName = Feature.createName(getFeatureName(), feature.getName()); 146 Object featureValue = feature.getValue(); 147 if (featureValue == null) 148 return Collections.emptyList(); 149 else if (featureValue instanceof String) { 150 String text = featureValue.toString(); 151 StringBuilder builder = new StringBuilder(); 152 String lastType = null; 153 boolean multipleRepeats = false; 154 for (int i = 0; i < text.length(); i += 1) { 155 char c = text.charAt(i); 156 String type = classifyChar(c); 157 switch (this.patternType) { 158 case ONE_PER_CHAR: 159 builder.append(type); 160 break; 161 case REPEATS_MERGED: 162 if (!type.equals(lastType)) { 163 builder.append(type); 164 } 165 break; 166 case REPEATS_AS_KLEENE_PLUS: 167 if (!type.equals(lastType)) { 168 builder.append(type); 169 multipleRepeats = false; 170 } else if (!multipleRepeats) { 171 builder.append('+'); 172 multipleRepeats = true; 173 } 174 } 175 lastType = type; 176 } 177 return Collections.singletonList(new Feature(featureName, builder.toString())); 178 } 179 return Collections.emptyList(); 180 } 181 182 protected String classifyChar(char c) { 183 int typeInt = Character.getType(c); 184 switch (typeInt) { 185 case Character.CONTROL: 186 return "CC"; 187 case Character.FORMAT: 188 return "Cf"; 189 case Character.UNASSIGNED: 190 return "Cn"; 191 case Character.PRIVATE_USE: 192 return "Co"; 193 case Character.SURROGATE: 194 return "Cs"; 195 case Character.LOWERCASE_LETTER: 196 return "Ll"; 197 case Character.MODIFIER_LETTER: 198 return "Lm"; 199 case Character.OTHER_LETTER: 200 return "Lo"; 201 case Character.TITLECASE_LETTER: 202 return "Lt"; 203 case Character.UPPERCASE_LETTER: 204 return "Lu"; 205 case Character.COMBINING_SPACING_MARK: 206 return "Mc"; 207 case Character.ENCLOSING_MARK: 208 return "Me"; 209 case Character.NON_SPACING_MARK: 210 return "Mn"; 211 case Character.DECIMAL_DIGIT_NUMBER: 212 return "Nd"; 213 case Character.LETTER_NUMBER: 214 return "Nl"; 215 case Character.OTHER_NUMBER: 216 return "No"; 217 case Character.CONNECTOR_PUNCTUATION: 218 return "Pc"; 219 case Character.DASH_PUNCTUATION: 220 return "Pd"; 221 case Character.END_PUNCTUATION: 222 return "Pe"; 223 case Character.FINAL_QUOTE_PUNCTUATION: 224 return "Pf"; 225 case Character.INITIAL_QUOTE_PUNCTUATION: 226 return "Pi"; 227 case Character.OTHER_PUNCTUATION: 228 return "Po"; 229 case Character.START_PUNCTUATION: 230 return "Ps"; 231 case Character.CURRENCY_SYMBOL: 232 return "Sc"; 233 case Character.MODIFIER_SYMBOL: 234 return "Sk"; 235 case Character.MATH_SYMBOL: 236 return "Sm"; 237 case Character.OTHER_SYMBOL: 238 return "So"; 239 case Character.LINE_SEPARATOR: 240 return "Zl"; 241 case Character.PARAGRAPH_SEPARATOR: 242 return "Zp"; 243 case Character.SPACE_SEPARATOR: 244 return "Zs"; 245 default: 246 throw new RuntimeException("Unknown character type: " + typeInt); 247 } 248 } 249}