001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.ml.feature.extractor;
025
026import java.util.ArrayList;
027import java.util.Arrays;
028import java.util.HashSet;
029import java.util.List;
030import java.util.Set;
031import java.util.logging.Logger;
032import java.util.regex.Matcher;
033import java.util.regex.Pattern;
034
035import org.apache.uima.cas.ArrayFS;
036import org.apache.uima.cas.Feature;
037import org.apache.uima.cas.FeatureStructure;
038import org.apache.uima.cas.Type;
039import org.apache.uima.cas.TypeSystem;
040import org.apache.uima.jcas.JCas;
041import org.apache.uima.jcas.cas.BooleanArray;
042import org.apache.uima.jcas.cas.ByteArray;
043import org.apache.uima.jcas.cas.DoubleArray;
044import org.apache.uima.jcas.cas.FSArray;
045import org.apache.uima.jcas.cas.FloatArray;
046import org.apache.uima.jcas.cas.IntegerArray;
047import org.apache.uima.jcas.cas.LongArray;
048import org.apache.uima.jcas.cas.ShortArray;
049import org.apache.uima.jcas.cas.StringArray;
050import org.apache.uima.jcas.tcas.Annotation;
051import org.cleartk.ml.feature.TypePathFeature;
052import org.apache.uima.fit.util.JCasUtil;
053
054/**
055 * <br>
056 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
057 * All rights reserved.
058 * 
059 * <p>
060 * 
061 * @author Philip Ogren
062 * 
063 *         TODO handle cases where the pathValue is not a String so that they are not necessarily
064 *         converted to strings
065 */
066
067public class TypePathExtractor<T extends Annotation> implements NamedFeatureExtractor1<T> {
068
069  String featureName;
070
071  Class<? extends Annotation> focusClass;
072
073  Type type;
074
075  String path;
076
077  boolean allPaths;
078
079  boolean allValues;
080
081  boolean uniqueValues;
082
083  boolean pathChecked = false;
084
085  TypeSystem typeSystem;
086
087  Logger logger = Logger.getLogger(TypePathExtractor.class.getName());
088
089  /**
090   * This extractor creates features from attributes of an annotation. For example, if you had a
091   * type called Token with an attribute called 'pos' you could use this extractor to extract a pos
092   * attribute for tokens. This would be done by calling the constructor with the values "pos",
093   * "pos", false, and false.
094   * 
095   * The value you want may be nested within the type structure of the annotation that is being
096   * examined. For example, if you might have a type called 'NamedEntity' that has an attribute of
097   * type 'ResourceEntry' that has an attribute of type 'DbRecord' that has a String attribute
098   * called 'identifier'. You may want to extract the value of the identifier as a feature of the
099   * NamedEntity annotation. This could be done by calling the constructor with the values
100   * "identifier", "resourceEntry/dbRecord/identifier", false, and false (or something similar).
101   * 
102   * The value you want for your featured may be multi-valued
103   * 
104   * @param focusClass
105   *          the type of annotation that you are doing feature extraction on.
106   * @param typePath
107   *          a string representation of the path that should be traversed to extract a feature
108   *          value (e.g. "resourceEntry/dbRecord/identifier" or "pos" from the examples above.)
109   * @param traverseAllPaths
110   *          The path you traverse to the value you are trying to retrieve may include attributes
111   *          that have multiple-values. If true, then all paths are examined and features for each
112   *          traversal are added if possible. If false, then the first path that results in a
113   *          non-null value will be examined.
114   * @param returnAllValues
115   *          The last node of the path may be multi-valued. If true, then all values found in the
116   *          last node will be returned as features. If false, then only the first value of the
117   *          last node is returned. If traverseAllPaths and returnAllValues are both false, then a
118   *          list of size 0 or 1 will be returned. The other three combinations are each valid and
119   *          may return a list of size 0 or greater.
120   * @param uniqueValues
121   *          if true, then the returned values of the extract method will be unique.
122   */
123
124  public TypePathExtractor(
125      Class<T> focusClass,
126      String typePath,
127      boolean traverseAllPaths,
128      boolean returnAllValues,
129      boolean uniqueValues) {
130    this.featureName = createName(null, typePath);
131    this.focusClass = focusClass;
132    this.path = typePath;
133    this.allPaths = traverseAllPaths;
134    this.allValues = returnAllValues;
135    this.uniqueValues = uniqueValues;
136  }
137
138  private static Pattern pattern = Pattern.compile("/([^/])?");
139
140  /**
141   * WARNING: this method is public for TypePathFeature backwards compatibility, but should not be
142   * used by anyone else!
143   */
144  public static String createName(String namePrefix, String typePath) {
145    if (namePrefix == null)
146      namePrefix = "TypePath";
147    String typePathString = typePath == null ? "" : typePath;
148
149    Matcher matcher = pattern.matcher(typePathString);
150    StringBuffer sb = new StringBuffer();
151    while (matcher.find()) {
152      if (matcher.group(1) != null)
153        matcher.appendReplacement(sb, matcher.group(1).toUpperCase());
154      else
155        matcher.appendReplacement(sb, "");
156    }
157    matcher.appendTail(sb);
158
159    // may not be > 0 if path is "" or "/"
160    if (sb.length() > 0)
161      sb.replace(0, 1, sb.substring(0, 1).toUpperCase());
162
163    if (sb.length() > 0) {
164      return String.format("%s(%s)", namePrefix, sb.toString());
165    } else {
166      return null;
167    }
168  }
169
170  /**
171   * calls this(type, typePath, false, false, true, jCas)
172   */
173  public TypePathExtractor(Class<T> focusClass, String typePath) {
174    this(focusClass, typePath, false, false, true);
175  }
176
177  @Override
178  public String getFeatureName() {
179    return this.featureName;
180  }
181
182  @Override
183  public List<org.cleartk.ml.Feature> extract(JCas view, Annotation focusAnnotation)
184      throws CleartkExtractorException {
185    if (this.type == null)
186      this.type = JCasUtil.getType(view, this.focusClass);
187
188    this.typeSystem = view.getTypeSystem();
189
190    if (!isValidPath(view))
191      throw CleartkExtractorException.invalidTypePath(path, type);
192
193    String[] pathMembers = path.split("/");
194    List<Object> pathValues = new ArrayList<Object>();
195    _extract(view, focusAnnotation, pathMembers, pathValues);
196
197    List<org.cleartk.ml.Feature> returnValues = new ArrayList<org.cleartk.ml.Feature>();
198    Set<Object> values = new HashSet<Object>();
199    for (Object pathValue : pathValues) {
200      if (!uniqueValues || !values.contains(pathValue)) {
201        returnValues.add(new TypePathFeature(null, pathValue, this.path, this.featureName));
202        values.add(pathValue);
203      }
204    }
205
206    return returnValues;
207  }
208
209  private void _extract(
210      JCas view,
211      FeatureStructure featureStructure,
212      String[] pathMembers,
213      List<Object> pathValues) throws CleartkExtractorException {
214    if (pathMembers.length == 1) {
215      Feature feature = featureStructure.getType().getFeatureByBaseName(pathMembers[0]);
216      if (feature == null) {
217        return;
218      }
219      Type featureType = feature.getRange();
220      if (featureType.isPrimitive()) {
221        Object pathValue = getPrimitiveFeatureValue(view, featureStructure, feature);
222        if (pathValue != null)
223          pathValues.add(pathValue);
224      } else if (typeSystem.subsumes(typeSystem.getType("uima.tcas.Annotation"), featureType)) {
225        String coveredText = ((Annotation) featureStructure.getFeatureValue(feature)).getCoveredText();
226        if (coveredText != null)
227          pathValues.add(coveredText);
228      } else if (featureType.isArray()) {
229        Type componentType = featureType.getComponentType();
230        if (componentType.isPrimitive()) {
231          Object[] values = getPrimitiveArrayFeatureValue(view, featureStructure, feature);
232          if (allValues)
233            pathValues.addAll(Arrays.asList(values));
234          else if (values.length > 0)
235            pathValues.add(values[0]);
236        } else if (typeSystem.subsumes(typeSystem.getType("uima.tcas.Annotation"), componentType)) {
237          ArrayFS fsArray = (ArrayFS) featureStructure.getFeatureValue(feature);
238          FeatureStructure[] array = fsArray.toArray();
239          if (allValues) {
240            for (FeatureStructure ftr : array)
241              pathValues.add(((Annotation) ftr).getCoveredText());
242          } else {
243            if (array.length > 0)
244              pathValues.add(((Annotation) array[0]).getCoveredText());
245          }
246        }
247      }
248      // TODO else if type is a List type
249    } else {
250      String[] remainingPathMembers = new String[pathMembers.length - 1];
251      System.arraycopy(pathMembers, 1, remainingPathMembers, 0, pathMembers.length - 1);
252
253      Feature feature = featureStructure.getType().getFeatureByBaseName(pathMembers[0]);
254      FeatureStructure featureValue = featureStructure.getFeatureValue(feature);
255      if (featureValue == null)
256        return;
257      if (featureValue instanceof FSArray) {
258        FSArray fsArray = (FSArray) featureValue;
259        if (allPaths) {
260          for (int i = 0; i < fsArray.size(); i++) {
261            FeatureStructure fs = fsArray.get(i);
262            _extract(view, fs, remainingPathMembers, pathValues);
263          }
264        } else {
265          if (fsArray.size() > 0)
266            _extract(view, fsArray.get(0), remainingPathMembers, pathValues);
267        }
268      }
269      // TODO else if(featureValue instanceof FSList)
270      else {
271        _extract(view, featureValue, remainingPathMembers, pathValues);
272      }
273    }
274  }
275
276  private boolean isValidPath(JCas view) {
277    if (!pathChecked) {
278      boolean validPath = isValidPath(type, path, view);
279      if (validPath)
280        pathChecked = true;
281      return validPath;
282    } else
283      return true;
284  }
285
286  // TODO should be possible to just get the Feature from the type system and
287  // return
288  // true if it is not null.
289  public static boolean isValidPath(Type type, String path, JCas view) {
290    String[] pathMembers = path.split("/");
291    Type pathMemberType = type; // will be set to type of last path member
292    // feature type
293    for (String pathMember : pathMembers) {
294      Feature feature = pathMemberType.getFeatureByBaseName(pathMember);
295      if (feature == null) {
296        return false;
297      }
298      pathMemberType = feature.getRange();
299      if (pathMemberType.isArray())
300        pathMemberType = pathMemberType.getComponentType();
301    }
302    return isValidType(pathMemberType, view.getTypeSystem());
303  }
304
305  private static final String[] HANDLED_TYPES = new String[] {
306      "uima.cas.Boolean",
307      "uima.cas.BooleanArray",
308      "uima.cas.Byte",
309      "uima.cas.ByteArray",
310      "uima.cas.Double",
311      "uima.cas.DoubleArray",
312      "uima.cas.Float",
313      "uima.cas.FloatArray",
314      "uima.cas.FloatList",
315      "uima.cas.Integer",
316      "uima.cas.IntegerArray",
317      "uima.cas.IntegerList",
318      "uima.cas.Long",
319      "uima.cas.LongArray",
320      "uima.cas.Short",
321      "uima.cas.ShortArray",
322      "uima.cas.String",
323      "uima.cas.StringArray",
324      "uima.cas.StringList",
325      "uima.tcas.Annotation" };
326
327  public static boolean isValidType(Type type, TypeSystem typeSystem) {
328    String typeName = type.getName();
329    for (String handledType : HANDLED_TYPES) {
330      if (typeName.equals(handledType))
331        return true;
332    }
333
334    // see section 2.3.4 of UIMA References
335    if (typeSystem.subsumes(typeSystem.getType("uima.cas.String"), type))
336      return true;
337    if (typeSystem.subsumes(typeSystem.getType("uima.tcas.Annotation"), type))
338      return true;
339
340    return false;
341  }
342
343  /**
344   * see section 4.2.1 of the UIMA References documentation.
345   * 
346   * @param view
347   * @param featureStructure
348   * @param feature
349   * @return The feature value.
350   */
351  private static Object getPrimitiveFeatureValue(
352      JCas view,
353      FeatureStructure featureStructure,
354      Feature feature) throws CleartkExtractorException {
355    TypeSystem typeSystem = view.getTypeSystem();
356    Type type = feature.getRange();
357    if (type.equals(typeSystem.getType("uima.cas.Boolean")))
358      return featureStructure.getBooleanValue(feature);
359    else if (type.equals(typeSystem.getType("uima.cas.Double")))
360      return featureStructure.getDoubleValue(feature);
361    else if (type.equals(typeSystem.getType("uima.cas.Float")))
362      return featureStructure.getFloatValue(feature);
363    else if (type.equals(typeSystem.getType("uima.cas.Byte")))
364      return featureStructure.getByteValue(feature);
365    else if (type.equals(typeSystem.getType("uima.cas.Short")))
366      return featureStructure.getShortValue(feature);
367    else if (type.equals(typeSystem.getType("uima.cas.Integer")))
368      return featureStructure.getIntValue(feature);
369    else if (type.equals(typeSystem.getType("uima.cas.Long")))
370      return featureStructure.getLongValue(feature);
371    else if (type.equals(typeSystem.getType("uima.cas.String")))
372      return featureStructure.getStringValue(feature);
373    else
374      throw CleartkExtractorException.notPrimitive(feature);
375  }
376
377  private static Object[] getPrimitiveArrayFeatureValue(
378      JCas view,
379      FeatureStructure featureStructure,
380      Feature feature) throws CleartkExtractorException {
381    TypeSystem typeSystem = view.getTypeSystem();
382    Type type = feature.getRange();
383    if (type.isArray()) {
384      Type componentType = type.getComponentType();
385      FeatureStructure featureValue = featureStructure.getFeatureValue(feature);
386      if (componentType.equals(typeSystem.getType("uima.cas.String"))) {
387        return ((StringArray) featureValue).toArray();
388      } else if (componentType.equals(typeSystem.getType("uima.cas.Boolean"))) {
389        return Arrays.asList(((BooleanArray) featureValue).toArray()).toArray();
390      } else if (componentType.equals(typeSystem.getType("uima.cas.Double"))) {
391        return Arrays.asList(((DoubleArray) featureValue).toArray()).toArray();
392      } else if (componentType.equals(typeSystem.getType("uima.cas.Float"))) {
393        return Arrays.asList(((FloatArray) featureValue).toArray()).toArray();
394      } else if (componentType.equals(typeSystem.getType("uima.cas.Byte"))) {
395        return Arrays.asList(((ByteArray) featureValue).toArray()).toArray();
396      } else if (componentType.equals(typeSystem.getType("uima.cas.Short"))) {
397        return Arrays.asList(((ShortArray) featureValue).toArray()).toArray();
398      } else if (componentType.equals(typeSystem.getType("uima.cas.Integer"))) {
399        return Arrays.asList(((IntegerArray) featureValue).toArray()).toArray();
400      } else if (componentType.equals(typeSystem.getType("uima.cas.Long"))) {
401        return Arrays.asList(((LongArray) featureValue).toArray()).toArray();
402      }
403    } else
404      throw CleartkExtractorException.notPrimitiveArray(feature);
405    return null;
406  }
407
408  public boolean isAllPaths() {
409    return allPaths;
410  }
411
412  public boolean isAllValues() {
413    return allValues;
414  }
415
416  public Class<? extends Annotation> getFocusClass() {
417    return focusClass;
418  }
419
420  public String getPath() {
421    return path;
422  }
423
424  public boolean isUniqueValues() {
425    return uniqueValues;
426  }
427
428}