001/** 002 * Copyright (c) 2007-2012, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.ml.feature.function; 025 026import java.util.Collections; 027import java.util.List; 028 029import org.cleartk.ml.Feature; 030 031/** 032 * <br> 033 * Copyright (c) 2007-2012, Regents of the University of Colorado <br> 034 * All rights reserved. 035 * 036 * 037 * @author Philip Ogren 038 * 039 */ 040public class CharacterNgramFeatureFunction implements FeatureFunction { 041 042 public enum Orientation { 043 RIGHT_TO_LEFT, LEFT_TO_RIGHT 044 } 045 046 Orientation orientation; 047 048 int start; 049 050 int end; 051 052 int minimumValueLength; 053 054 boolean lowerCase; 055 056 private String name; 057 058 /** 059 * This feature function serves up character n-grams based on StringValued features. For example, 060 * if you wanted trigram suffixes (e.g. 'ion' of 'emotion') for words that are of length 7 or more 061 * you could call the constructor with the following: 062 * CharacterNGramFeatureFunction(Orientation.RIGHT_TO_LEFT, 0, 3, 7, false) 063 * 064 * @param featureName 065 * a user-specified name for the feature function, to be included in all feature names. 066 * @param orientation 067 * must be one of LEFT_TO_RIGHT or RIGHT_TO_LEFT. The orientation determines whether 068 * index 0 corresponds to the first character of the string value or the last. The 069 * orientation does not affect the ordering of the characters in the n-gram which are 070 * always returned in left-to-right order. 071 * @param start 072 * the start of the n-gram (typically 0 for both orientations) 073 * @param end 074 * the end of the n-gram (typically n for both orientations) 075 * @param minimumValueLength 076 * This parameter allows you to skip string values that are too short. It must be greater 077 * than or equal to end. 078 * @param lowerCase 079 * if true than the n-gram used as the feature value will be lowercased. 080 */ 081 public CharacterNgramFeatureFunction( 082 String featureName, 083 Orientation orientation, 084 int start, 085 int end, 086 int minimumValueLength, 087 boolean lowerCase) { 088 name = Feature.createName( 089 "NGram", 090 orientation == Orientation.RIGHT_TO_LEFT ? "Right" : "Left", 091 String.valueOf(start), 092 String.valueOf(end), 093 String.valueOf(minimumValueLength), 094 lowerCase ? "lower" : null, 095 featureName); 096 if (minimumValueLength < end) { 097 throw new IllegalArgumentException( 098 "minimumValueLength must be greater than or equal to the parameter end."); 099 } 100 this.orientation = orientation; 101 this.start = start; 102 this.end = end; 103 this.minimumValueLength = minimumValueLength; 104 this.lowerCase = lowerCase; 105 } 106 107 public CharacterNgramFeatureFunction( 108 Orientation orientation, 109 int start, 110 int end, 111 int minimumValueLength, 112 boolean lowerCase) { 113 this(null, orientation, start, end, minimumValueLength, lowerCase); 114 } 115 116 public CharacterNgramFeatureFunction( 117 String featureName, 118 Orientation orientation, 119 int start, 120 int end) { 121 this(featureName, orientation, start, end, end - start, false); 122 } 123 124 public CharacterNgramFeatureFunction(Orientation orientation, int start, int end) { 125 this(null, orientation, start, end); 126 } 127 128 /** 129 * @return will return an empty list if the value of the feature is not a StringValue or is not as 130 * long as the minimumValueLength. 131 */ 132 @Override 133 public List<Feature> apply(Feature feature) { 134 String featureName = Feature.createName(name, feature.getName()); 135 Object featureValue = feature.getValue(); 136 if (featureValue == null || !(featureValue instanceof String)) 137 return Collections.emptyList(); 138 139 String value = featureValue.toString(); 140 if (value == null || value.length() < minimumValueLength) 141 return Collections.emptyList(); 142 143 String ngram; 144 if (orientation == Orientation.LEFT_TO_RIGHT) { 145 ngram = value.substring(start, end); 146 } else { 147 ngram = value.substring(value.length() - end, value.length() - start); 148 } 149 if (lowerCase) 150 ngram = ngram.toLowerCase(); 151 152 return Collections.singletonList(new Feature(featureName, ngram)); 153 } 154 155}