001/* 
002 * Copyright (c) 2011, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.ml.feature.function;
025
026import java.util.Collections;
027import java.util.List;
028
029import org.apache.uima.jcas.JCas;
030import org.apache.uima.jcas.tcas.Annotation;
031import org.cleartk.ml.Feature;
032import org.cleartk.ml.feature.extractor.CleartkExtractorException;
033import org.cleartk.ml.feature.extractor.NamedFeatureExtractor1;
034
035/**
036 * A feature extractor that generates a pattern based on the <a
037 * href="http://unicode.org/reports/tr49/">Unicode categories</a> of each of the characters in the
038 * annotation text. For example, "A-z0" is an uppercase letter, followed by a dash, followed by a
039 * lowercase letter, followed by a digit, and so would get the pattern "LuPdLlNd".
040 * 
041 * <br>
042 * Copyright (c) 2011, Regents of the University of Colorado <br>
043 * All rights reserved.
044 * 
045 * @author Steven Bethard
046 */
047public class CharacterCategoryPatternFunction<T extends Annotation> implements FeatureFunction {
048
049  /**
050   * The type of pattern to generate in feature values.
051   */
052  public static enum PatternType {
053    /**
054     * The standard pattern, where one category abbreviation is added to the feature value for each
055     * character in the text.
056     */
057    ONE_PER_CHAR,
058    /**
059     * A simplified pattern, where if the same category appears many times in a row, the category is
060     * added to the feature value only once. For example "XX00" would get the pattern "LuNd" since
061     * there are two uppercase letters followed by two digits.
062     */
063    REPEATS_MERGED,
064    /**
065     * Similar to REPEATS_MERGED, but distinguishes between the same category appearing once and
066     * more than once in a row. If the same category appears twice or more in a row, then we will
067     * mark that category with a Kleene plus '+'. For example "X000" would get the pattern "LuNd+"
068     * since there is a single uppercase letter followed by more than one digit.
069     */
070    REPEATS_AS_KLEENE_PLUS
071  }
072
073  private PatternType patternType;
074
075  private String name;
076
077  public static <T extends Annotation> NamedFeatureExtractor1<T> createExtractor() {
078    return createExtractor(PatternType.ONE_PER_CHAR);
079  }
080
081  /*
082   * I would have returned a simple FeatureFunctionExtractor using the following code: return new
083   * FeatureFunctionExtractor<T>(new CoveredTextExtractor<T>(), false, new
084   * CharacterCategoryPatternFunction<T>()); but TimeAnnotator wanted a NamedFeatureExtractor1. So I
085   * did the following to maintain backwards compatibility. After all, the converted feature
086   * extractor was a NamedFeatureExtractor1.
087   */
088
089  public static <T extends Annotation> NamedFeatureExtractor1<T> createExtractor(
090      PatternType patternType) {
091    final CharacterCategoryPatternFunction<T> ccpf = new CharacterCategoryPatternFunction<T>(
092        patternType);
093    return new NamedFeatureExtractor1<T>() {
094
095      @Override
096      public List<Feature> extract(JCas view, Annotation focusAnnotation)
097          throws CleartkExtractorException {
098        String text = focusAnnotation.getCoveredText();
099        return ccpf.apply(new Feature(null, text));
100      }
101
102      @Override
103      public String getFeatureName() {
104        return ccpf.getFeatureName();
105      }
106    };
107  }
108
109  /**
110   * Create the standard feature extractor, where one category is added to the feature value for
111   * each character in the text. See {@link PatternType#ONE_PER_CHAR}.
112   */
113  public CharacterCategoryPatternFunction() {
114    this(PatternType.ONE_PER_CHAR);
115  }
116
117  /**
118   * Create a feature extractor with the specified pattern type. See {@link PatternType} for the
119   * acceptable pattern types.
120   * 
121   * @param patternType
122   *          The type of pattern to generate in feature values.
123   */
124  public CharacterCategoryPatternFunction(PatternType patternType) {
125    this.patternType = patternType;
126    switch (this.patternType) {
127      case ONE_PER_CHAR:
128        this.name = "CharPattern";
129        break;
130      case REPEATS_MERGED:
131        this.name = "CharPatternRepeatsMerged";
132        break;
133      case REPEATS_AS_KLEENE_PLUS:
134        this.name = "CharPatternRepeatsAsKleenePlus";
135        break;
136    }
137  }
138
139  public String getFeatureName() {
140    return this.name;
141  }
142
143  @Override
144  public List<Feature> apply(Feature feature) {
145    String featureName = Feature.createName(getFeatureName(), feature.getName());
146    Object featureValue = feature.getValue();
147    if (featureValue == null)
148      return Collections.emptyList();
149    else if (featureValue instanceof String) {
150      String text = featureValue.toString();
151      StringBuilder builder = new StringBuilder();
152      String lastType = null;
153      boolean multipleRepeats = false;
154      for (int i = 0; i < text.length(); i += 1) {
155        char c = text.charAt(i);
156        String type = classifyChar(c);
157        switch (this.patternType) {
158          case ONE_PER_CHAR:
159            builder.append(type);
160            break;
161          case REPEATS_MERGED:
162            if (!type.equals(lastType)) {
163              builder.append(type);
164            }
165            break;
166          case REPEATS_AS_KLEENE_PLUS:
167            if (!type.equals(lastType)) {
168              builder.append(type);
169              multipleRepeats = false;
170            } else if (!multipleRepeats) {
171              builder.append('+');
172              multipleRepeats = true;
173            }
174        }
175        lastType = type;
176      }
177      return Collections.singletonList(new Feature(featureName, builder.toString()));
178    }
179    return Collections.emptyList();
180  }
181
182  protected String classifyChar(char c) {
183    int typeInt = Character.getType(c);
184    switch (typeInt) {
185      case Character.CONTROL:
186        return "CC";
187      case Character.FORMAT:
188        return "Cf";
189      case Character.UNASSIGNED:
190        return "Cn";
191      case Character.PRIVATE_USE:
192        return "Co";
193      case Character.SURROGATE:
194        return "Cs";
195      case Character.LOWERCASE_LETTER:
196        return "Ll";
197      case Character.MODIFIER_LETTER:
198        return "Lm";
199      case Character.OTHER_LETTER:
200        return "Lo";
201      case Character.TITLECASE_LETTER:
202        return "Lt";
203      case Character.UPPERCASE_LETTER:
204        return "Lu";
205      case Character.COMBINING_SPACING_MARK:
206        return "Mc";
207      case Character.ENCLOSING_MARK:
208        return "Me";
209      case Character.NON_SPACING_MARK:
210        return "Mn";
211      case Character.DECIMAL_DIGIT_NUMBER:
212        return "Nd";
213      case Character.LETTER_NUMBER:
214        return "Nl";
215      case Character.OTHER_NUMBER:
216        return "No";
217      case Character.CONNECTOR_PUNCTUATION:
218        return "Pc";
219      case Character.DASH_PUNCTUATION:
220        return "Pd";
221      case Character.END_PUNCTUATION:
222        return "Pe";
223      case Character.FINAL_QUOTE_PUNCTUATION:
224        return "Pf";
225      case Character.INITIAL_QUOTE_PUNCTUATION:
226        return "Pi";
227      case Character.OTHER_PUNCTUATION:
228        return "Po";
229      case Character.START_PUNCTUATION:
230        return "Ps";
231      case Character.CURRENCY_SYMBOL:
232        return "Sc";
233      case Character.MODIFIER_SYMBOL:
234        return "Sk";
235      case Character.MATH_SYMBOL:
236        return "Sm";
237      case Character.OTHER_SYMBOL:
238        return "So";
239      case Character.LINE_SEPARATOR:
240        return "Zl";
241      case Character.PARAGRAPH_SEPARATOR:
242        return "Zp";
243      case Character.SPACE_SEPARATOR:
244        return "Zs";
245      default:
246        throw new RuntimeException("Unknown character type: " + typeInt);
247    }
248  }
249}