001/** 002 * Copyright (c) 2007-2008, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.ml.feature.extractor; 025 026import java.util.ArrayList; 027import java.util.Arrays; 028import java.util.HashSet; 029import java.util.List; 030import java.util.Set; 031import java.util.logging.Logger; 032import java.util.regex.Matcher; 033import java.util.regex.Pattern; 034 035import org.apache.uima.cas.ArrayFS; 036import org.apache.uima.cas.Feature; 037import org.apache.uima.cas.FeatureStructure; 038import org.apache.uima.cas.Type; 039import org.apache.uima.cas.TypeSystem; 040import org.apache.uima.jcas.JCas; 041import org.apache.uima.jcas.cas.BooleanArray; 042import org.apache.uima.jcas.cas.ByteArray; 043import org.apache.uima.jcas.cas.DoubleArray; 044import org.apache.uima.jcas.cas.FSArray; 045import org.apache.uima.jcas.cas.FloatArray; 046import org.apache.uima.jcas.cas.IntegerArray; 047import org.apache.uima.jcas.cas.LongArray; 048import org.apache.uima.jcas.cas.ShortArray; 049import org.apache.uima.jcas.cas.StringArray; 050import org.apache.uima.jcas.tcas.Annotation; 051import org.cleartk.ml.feature.TypePathFeature; 052import org.apache.uima.fit.util.JCasUtil; 053 054/** 055 * <br> 056 * Copyright (c) 2007-2008, Regents of the University of Colorado <br> 057 * All rights reserved. 058 * 059 * <p> 060 * 061 * @author Philip Ogren 062 * 063 * TODO handle cases where the pathValue is not a String so that they are not necessarily 064 * converted to strings 065 */ 066 067public class TypePathExtractor<T extends Annotation> implements NamedFeatureExtractor1<T> { 068 069 String featureName; 070 071 Class<? extends Annotation> focusClass; 072 073 Type type; 074 075 String path; 076 077 boolean allPaths; 078 079 boolean allValues; 080 081 boolean uniqueValues; 082 083 boolean pathChecked = false; 084 085 TypeSystem typeSystem; 086 087 Logger logger = Logger.getLogger(TypePathExtractor.class.getName()); 088 089 /** 090 * This extractor creates features from attributes of an annotation. For example, if you had a 091 * type called Token with an attribute called 'pos' you could use this extractor to extract a pos 092 * attribute for tokens. This would be done by calling the constructor with the values "pos", 093 * "pos", false, and false. 094 * 095 * The value you want may be nested within the type structure of the annotation that is being 096 * examined. For example, if you might have a type called 'NamedEntity' that has an attribute of 097 * type 'ResourceEntry' that has an attribute of type 'DbRecord' that has a String attribute 098 * called 'identifier'. You may want to extract the value of the identifier as a feature of the 099 * NamedEntity annotation. This could be done by calling the constructor with the values 100 * "identifier", "resourceEntry/dbRecord/identifier", false, and false (or something similar). 101 * 102 * The value you want for your featured may be multi-valued 103 * 104 * @param focusClass 105 * the type of annotation that you are doing feature extraction on. 106 * @param typePath 107 * a string representation of the path that should be traversed to extract a feature 108 * value (e.g. "resourceEntry/dbRecord/identifier" or "pos" from the examples above.) 109 * @param traverseAllPaths 110 * The path you traverse to the value you are trying to retrieve may include attributes 111 * that have multiple-values. If true, then all paths are examined and features for each 112 * traversal are added if possible. If false, then the first path that results in a 113 * non-null value will be examined. 114 * @param returnAllValues 115 * The last node of the path may be multi-valued. If true, then all values found in the 116 * last node will be returned as features. If false, then only the first value of the 117 * last node is returned. If traverseAllPaths and returnAllValues are both false, then a 118 * list of size 0 or 1 will be returned. The other three combinations are each valid and 119 * may return a list of size 0 or greater. 120 * @param uniqueValues 121 * if true, then the returned values of the extract method will be unique. 122 */ 123 124 public TypePathExtractor( 125 Class<T> focusClass, 126 String typePath, 127 boolean traverseAllPaths, 128 boolean returnAllValues, 129 boolean uniqueValues) { 130 this.featureName = createName(null, typePath); 131 this.focusClass = focusClass; 132 this.path = typePath; 133 this.allPaths = traverseAllPaths; 134 this.allValues = returnAllValues; 135 this.uniqueValues = uniqueValues; 136 } 137 138 private static Pattern pattern = Pattern.compile("/([^/])?"); 139 140 /** 141 * WARNING: this method is public for TypePathFeature backwards compatibility, but should not be 142 * used by anyone else! 143 */ 144 public static String createName(String namePrefix, String typePath) { 145 if (namePrefix == null) 146 namePrefix = "TypePath"; 147 String typePathString = typePath == null ? "" : typePath; 148 149 Matcher matcher = pattern.matcher(typePathString); 150 StringBuffer sb = new StringBuffer(); 151 while (matcher.find()) { 152 if (matcher.group(1) != null) 153 matcher.appendReplacement(sb, matcher.group(1).toUpperCase()); 154 else 155 matcher.appendReplacement(sb, ""); 156 } 157 matcher.appendTail(sb); 158 159 // may not be > 0 if path is "" or "/" 160 if (sb.length() > 0) 161 sb.replace(0, 1, sb.substring(0, 1).toUpperCase()); 162 163 if (sb.length() > 0) { 164 return String.format("%s(%s)", namePrefix, sb.toString()); 165 } else { 166 return null; 167 } 168 } 169 170 /** 171 * calls this(type, typePath, false, false, true, jCas) 172 */ 173 public TypePathExtractor(Class<T> focusClass, String typePath) { 174 this(focusClass, typePath, false, false, true); 175 } 176 177 @Override 178 public String getFeatureName() { 179 return this.featureName; 180 } 181 182 @Override 183 public List<org.cleartk.ml.Feature> extract(JCas view, Annotation focusAnnotation) 184 throws CleartkExtractorException { 185 if (this.type == null) 186 this.type = JCasUtil.getType(view, this.focusClass); 187 188 this.typeSystem = view.getTypeSystem(); 189 190 if (!isValidPath(view)) 191 throw CleartkExtractorException.invalidTypePath(path, type); 192 193 String[] pathMembers = path.split("/"); 194 List<Object> pathValues = new ArrayList<Object>(); 195 _extract(view, focusAnnotation, pathMembers, pathValues); 196 197 List<org.cleartk.ml.Feature> returnValues = new ArrayList<org.cleartk.ml.Feature>(); 198 Set<Object> values = new HashSet<Object>(); 199 for (Object pathValue : pathValues) { 200 if (!uniqueValues || !values.contains(pathValue)) { 201 returnValues.add(new TypePathFeature(null, pathValue, this.path, this.featureName)); 202 values.add(pathValue); 203 } 204 } 205 206 return returnValues; 207 } 208 209 private void _extract( 210 JCas view, 211 FeatureStructure featureStructure, 212 String[] pathMembers, 213 List<Object> pathValues) throws CleartkExtractorException { 214 if (pathMembers.length == 1) { 215 Feature feature = featureStructure.getType().getFeatureByBaseName(pathMembers[0]); 216 if (feature == null) { 217 return; 218 } 219 Type featureType = feature.getRange(); 220 if (featureType.isPrimitive()) { 221 Object pathValue = getPrimitiveFeatureValue(view, featureStructure, feature); 222 if (pathValue != null) 223 pathValues.add(pathValue); 224 } else if (typeSystem.subsumes(typeSystem.getType("uima.tcas.Annotation"), featureType)) { 225 String coveredText = ((Annotation) featureStructure.getFeatureValue(feature)).getCoveredText(); 226 if (coveredText != null) 227 pathValues.add(coveredText); 228 } else if (featureType.isArray()) { 229 Type componentType = featureType.getComponentType(); 230 if (componentType.isPrimitive()) { 231 Object[] values = getPrimitiveArrayFeatureValue(view, featureStructure, feature); 232 if (allValues) 233 pathValues.addAll(Arrays.asList(values)); 234 else if (values.length > 0) 235 pathValues.add(values[0]); 236 } else if (typeSystem.subsumes(typeSystem.getType("uima.tcas.Annotation"), componentType)) { 237 ArrayFS fsArray = (ArrayFS) featureStructure.getFeatureValue(feature); 238 FeatureStructure[] array = fsArray.toArray(); 239 if (allValues) { 240 for (FeatureStructure ftr : array) 241 pathValues.add(((Annotation) ftr).getCoveredText()); 242 } else { 243 if (array.length > 0) 244 pathValues.add(((Annotation) array[0]).getCoveredText()); 245 } 246 } 247 } 248 // TODO else if type is a List type 249 } else { 250 String[] remainingPathMembers = new String[pathMembers.length - 1]; 251 System.arraycopy(pathMembers, 1, remainingPathMembers, 0, pathMembers.length - 1); 252 253 Feature feature = featureStructure.getType().getFeatureByBaseName(pathMembers[0]); 254 FeatureStructure featureValue = featureStructure.getFeatureValue(feature); 255 if (featureValue == null) 256 return; 257 if (featureValue instanceof FSArray) { 258 FSArray fsArray = (FSArray) featureValue; 259 if (allPaths) { 260 for (int i = 0; i < fsArray.size(); i++) { 261 FeatureStructure fs = fsArray.get(i); 262 _extract(view, fs, remainingPathMembers, pathValues); 263 } 264 } else { 265 if (fsArray.size() > 0) 266 _extract(view, fsArray.get(0), remainingPathMembers, pathValues); 267 } 268 } 269 // TODO else if(featureValue instanceof FSList) 270 else { 271 _extract(view, featureValue, remainingPathMembers, pathValues); 272 } 273 } 274 } 275 276 private boolean isValidPath(JCas view) { 277 if (!pathChecked) { 278 boolean validPath = isValidPath(type, path, view); 279 if (validPath) 280 pathChecked = true; 281 return validPath; 282 } else 283 return true; 284 } 285 286 // TODO should be possible to just get the Feature from the type system and 287 // return 288 // true if it is not null. 289 public static boolean isValidPath(Type type, String path, JCas view) { 290 String[] pathMembers = path.split("/"); 291 Type pathMemberType = type; // will be set to type of last path member 292 // feature type 293 for (String pathMember : pathMembers) { 294 Feature feature = pathMemberType.getFeatureByBaseName(pathMember); 295 if (feature == null) { 296 return false; 297 } 298 pathMemberType = feature.getRange(); 299 if (pathMemberType.isArray()) 300 pathMemberType = pathMemberType.getComponentType(); 301 } 302 return isValidType(pathMemberType, view.getTypeSystem()); 303 } 304 305 private static final String[] HANDLED_TYPES = new String[] { 306 "uima.cas.Boolean", 307 "uima.cas.BooleanArray", 308 "uima.cas.Byte", 309 "uima.cas.ByteArray", 310 "uima.cas.Double", 311 "uima.cas.DoubleArray", 312 "uima.cas.Float", 313 "uima.cas.FloatArray", 314 "uima.cas.FloatList", 315 "uima.cas.Integer", 316 "uima.cas.IntegerArray", 317 "uima.cas.IntegerList", 318 "uima.cas.Long", 319 "uima.cas.LongArray", 320 "uima.cas.Short", 321 "uima.cas.ShortArray", 322 "uima.cas.String", 323 "uima.cas.StringArray", 324 "uima.cas.StringList", 325 "uima.tcas.Annotation" }; 326 327 public static boolean isValidType(Type type, TypeSystem typeSystem) { 328 String typeName = type.getName(); 329 for (String handledType : HANDLED_TYPES) { 330 if (typeName.equals(handledType)) 331 return true; 332 } 333 334 // see section 2.3.4 of UIMA References 335 if (typeSystem.subsumes(typeSystem.getType("uima.cas.String"), type)) 336 return true; 337 if (typeSystem.subsumes(typeSystem.getType("uima.tcas.Annotation"), type)) 338 return true; 339 340 return false; 341 } 342 343 /** 344 * see section 4.2.1 of the UIMA References documentation. 345 * 346 * @param view 347 * @param featureStructure 348 * @param feature 349 * @return The feature value. 350 */ 351 private static Object getPrimitiveFeatureValue( 352 JCas view, 353 FeatureStructure featureStructure, 354 Feature feature) throws CleartkExtractorException { 355 TypeSystem typeSystem = view.getTypeSystem(); 356 Type type = feature.getRange(); 357 if (type.equals(typeSystem.getType("uima.cas.Boolean"))) 358 return featureStructure.getBooleanValue(feature); 359 else if (type.equals(typeSystem.getType("uima.cas.Double"))) 360 return featureStructure.getDoubleValue(feature); 361 else if (type.equals(typeSystem.getType("uima.cas.Float"))) 362 return featureStructure.getFloatValue(feature); 363 else if (type.equals(typeSystem.getType("uima.cas.Byte"))) 364 return featureStructure.getByteValue(feature); 365 else if (type.equals(typeSystem.getType("uima.cas.Short"))) 366 return featureStructure.getShortValue(feature); 367 else if (type.equals(typeSystem.getType("uima.cas.Integer"))) 368 return featureStructure.getIntValue(feature); 369 else if (type.equals(typeSystem.getType("uima.cas.Long"))) 370 return featureStructure.getLongValue(feature); 371 else if (type.equals(typeSystem.getType("uima.cas.String"))) 372 return featureStructure.getStringValue(feature); 373 else 374 throw CleartkExtractorException.notPrimitive(feature); 375 } 376 377 private static Object[] getPrimitiveArrayFeatureValue( 378 JCas view, 379 FeatureStructure featureStructure, 380 Feature feature) throws CleartkExtractorException { 381 TypeSystem typeSystem = view.getTypeSystem(); 382 Type type = feature.getRange(); 383 if (type.isArray()) { 384 Type componentType = type.getComponentType(); 385 FeatureStructure featureValue = featureStructure.getFeatureValue(feature); 386 if (componentType.equals(typeSystem.getType("uima.cas.String"))) { 387 return ((StringArray) featureValue).toArray(); 388 } else if (componentType.equals(typeSystem.getType("uima.cas.Boolean"))) { 389 return Arrays.asList(((BooleanArray) featureValue).toArray()).toArray(); 390 } else if (componentType.equals(typeSystem.getType("uima.cas.Double"))) { 391 return Arrays.asList(((DoubleArray) featureValue).toArray()).toArray(); 392 } else if (componentType.equals(typeSystem.getType("uima.cas.Float"))) { 393 return Arrays.asList(((FloatArray) featureValue).toArray()).toArray(); 394 } else if (componentType.equals(typeSystem.getType("uima.cas.Byte"))) { 395 return Arrays.asList(((ByteArray) featureValue).toArray()).toArray(); 396 } else if (componentType.equals(typeSystem.getType("uima.cas.Short"))) { 397 return Arrays.asList(((ShortArray) featureValue).toArray()).toArray(); 398 } else if (componentType.equals(typeSystem.getType("uima.cas.Integer"))) { 399 return Arrays.asList(((IntegerArray) featureValue).toArray()).toArray(); 400 } else if (componentType.equals(typeSystem.getType("uima.cas.Long"))) { 401 return Arrays.asList(((LongArray) featureValue).toArray()).toArray(); 402 } 403 } else 404 throw CleartkExtractorException.notPrimitiveArray(feature); 405 return null; 406 } 407 408 public boolean isAllPaths() { 409 return allPaths; 410 } 411 412 public boolean isAllValues() { 413 return allValues; 414 } 415 416 public Class<? extends Annotation> getFocusClass() { 417 return focusClass; 418 } 419 420 public String getPath() { 421 return path; 422 } 423 424 public boolean isUniqueValues() { 425 return uniqueValues; 426 } 427 428}