001/** 002 * Copyright (c) 2007-2012, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.ml.feature.function; 025 026import java.util.Collections; 027import java.util.List; 028import java.util.regex.Pattern; 029 030import org.cleartk.ml.Feature; 031import org.cleartk.ml.feature.util.NumericTypeUtil; 032 033/** 034 * <br> 035 * Copyright (c) 2007-2012, Regents of the University of Colorado <br> 036 * All rights reserved. 037 * 038 * 039 * @author Philip Ogren 040 * 041 */ 042 043public class NumericTypeFeatureFunction implements FeatureFunction { 044 045 public static final String DEFAULT_NAME = "NumericType"; 046 047 public enum NumericType { 048 DIGITS, // all characters are digits. 049 YEAR_DIGITS, // characters look like a year - i.e. string is 4 digits starting with 1, 20 or 21 050 ALPHANUMERIC, // All characters are either letters or digits 051 SOME_DIGITS, // Contains some digits and some non-letters - but can still contain letters 052 ROMAN_NUMERAL 053 // matches a regular expression for Roman numerals. 054 } 055 056 static Pattern yearDigitsPattern = Pattern.compile("(?:1[0-9]{3,3})|(?:2[0|1][0-9]{2,2})"); 057 058 static Pattern alphanumericPattern = Pattern.compile("[a-zA-Z0-9-]+"); 059 060 static Pattern someLetters = Pattern.compile("[a-zA-Z]"); 061 062 static Pattern romanNumeralPattern = Pattern.compile("^M?M?M?(CM|CD|D?C?C?C?)(XC|XL|L?X?X?X?)(IX|IV|V?I?I?I?)$"); 063 064 /** 065 * If the value of the feature is a StringValue and is determined to be one of DIGITS, 066 * YEAR_DIGITS, ALPHANUMERIC, SOME_DIGITS, or ROMAN_NUMERAL, then a feature containing one of 067 * those five values is returned. If the value of the feature cannot be characterized by one of 068 * these five values, then an empty list is returned (e.g. the value is an empty string, contains 069 * only white space, or contains only letters, etc.) 070 * 071 * <p> 072 * This method draws heavily from NumericTypeTagger.py written by Steven Bethard. That code 073 * credits <a href="http://diveintopython.org/unit_testing/stage_5.html">Dive Into Python</a> for 074 * the regular expression for matching roman numerals. 075 * 076 * @return a feature that has a value that is one of DIGITS, YEAR_DIGITS, ALPHANUMERIC, 077 * SOME_DIGITS, or ROMAN_NUMERAL. Otherwise an empty list is returned. 078 */ 079 080 @Override 081 public List<Feature> apply(Feature feature) { 082 String featureName = Feature.createName(DEFAULT_NAME, feature.getName()); 083 Object featureValue = feature.getValue(); 084 if (featureValue == null) 085 return Collections.emptyList(); 086 else if (featureValue instanceof String) { 087 String value = featureValue.toString(); 088 if (value == null || value.length() == 0) 089 return Collections.emptyList(); 090 091 if (NumericTypeUtil.isDigits(value)) { 092 if (yearDigitsPattern.matcher(value).matches()) { 093 return Collections.singletonList(new Feature( 094 featureName, 095 NumericType.YEAR_DIGITS.toString())); 096 } else 097 return Collections.singletonList(new Feature(featureName, NumericType.DIGITS.toString())); 098 } else if (NumericTypeUtil.containsDigits(value)) { 099 if (alphanumericPattern.matcher(value).matches() && someLetters.matcher(value).find()) { 100 return Collections.singletonList(new Feature( 101 featureName, 102 NumericType.ALPHANUMERIC.toString())); 103 } else 104 return Collections.singletonList(new Feature( 105 featureName, 106 NumericType.SOME_DIGITS.toString())); 107 } else if (romanNumeralPattern.matcher(value).matches()) { 108 return Collections.singletonList(new Feature( 109 featureName, 110 NumericType.ROMAN_NUMERAL.toString())); 111 } 112 } 113 return Collections.emptyList(); 114 } 115}