001/** 002 * Copyright (c) 2012, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024 025package org.cleartk.ml.feature.transform.extractor; 026 027import java.io.BufferedReader; 028import java.io.BufferedWriter; 029import java.io.File; 030import java.io.FileReader; 031import java.io.FileWriter; 032import java.io.IOException; 033import java.io.Serializable; 034import java.net.URI; 035import java.util.ArrayList; 036import java.util.HashMap; 037import java.util.List; 038import java.util.Locale; 039import java.util.Map; 040 041import org.apache.uima.jcas.JCas; 042import org.apache.uima.jcas.tcas.Annotation; 043import org.cleartk.ml.Feature; 044import org.cleartk.ml.Instance; 045import org.cleartk.ml.feature.extractor.CleartkExtractorException; 046import org.cleartk.ml.feature.extractor.FeatureExtractor1; 047import org.cleartk.ml.feature.transform.OneToOneTrainableExtractor_ImplBase; 048import org.cleartk.ml.feature.transform.TransformableFeature; 049 050/** 051 * Scales features extracted by its subextractor to range 0-1, by scaling by the minimum and maximum 052 * values 053 * <p> 054 * 055 * Copyright (c) 2012, Regents of the University of Colorado <br> 056 * All rights reserved. 057 * 058 * @author Lee Becker 059 * 060 */ 061public class MinMaxNormalizationExtractor<OUTCOME_T, FOCUS_T extends Annotation> extends 062 OneToOneTrainableExtractor_ImplBase<OUTCOME_T> implements FeatureExtractor1<FOCUS_T> { 063 064 private FeatureExtractor1<FOCUS_T> subExtractor; 065 066 private boolean isTrained; 067 068 // This is read in after training for use in transformation 069 private Map<String, MinMaxPair> minMaxMap; 070 071 public MinMaxNormalizationExtractor(String name) { 072 this(name, null); 073 } 074 075 public MinMaxNormalizationExtractor(String name, FeatureExtractor1<FOCUS_T> subExtractor) { 076 super(name); 077 this.subExtractor = subExtractor; 078 this.isTrained = false; 079 } 080 081 @Override 082 protected Feature transform(Feature feature) { 083 String featureName = feature.getName(); 084 MinMaxPair stats = this.minMaxMap.get(featureName); 085 086 double mmn = 0.5d; // this is the default value we will return if we've never seen the feature 087 // before 088 089 double value = ((Number) feature.getValue()).doubleValue(); 090 // this is the typical case 091 if (stats != null && stats.min < stats.max) { 092 mmn = (value - stats.min) / (stats.max - stats.min); 093 } 094 // this is an edge case that could happen when the value is always the same 095 if (stats != null && stats.min == stats.max) { 096 if (value == stats.min) { 097 mmn = 0.5d; 098 } else { 099 mmn = value < stats.min ? 0 : 1; 100 } 101 } 102 mmn = Math.max(0, mmn); // if mmn is negative, then return zero (this would happen if the 103 // feature value was a the smallest value yet seen) 104 mmn = Math.min(1, mmn); // if mmn is more than one, then return 1 (this would happen if the 105 // feature value was the largest value yet seen) 106 return new Feature("MINMAX_NORMED_" + featureName, mmn); 107 } 108 109 @Override 110 public List<Feature> extract(JCas view, FOCUS_T focusAnnotation) throws CleartkExtractorException { 111 112 List<Feature> extracted = this.subExtractor.extract(view, focusAnnotation); 113 List<Feature> result = new ArrayList<Feature>(); 114 if (this.isTrained) { 115 // We have trained / loaded a MinMax model, so now fix up the values 116 for (Feature feature : extracted) { 117 result.add(this.transform(feature)); 118 } 119 } else { 120 // We haven't trained this extractor yet, so just mark the existing features 121 // for future modification, by creating one mega container feature 122 result.add(new TransformableFeature(this.name, extracted)); 123 } 124 125 return result; 126 } 127 128 @Override 129 public void train(Iterable<Instance<OUTCOME_T>> instances) { 130 Map<String, MinMaxRunningStat> featureStatsMap = new HashMap<String, MinMaxRunningStat>(); 131 132 // keep a running mean and standard deviation for all applicable features 133 for (Instance<OUTCOME_T> instance : instances) { 134 // Grab the matching zmus (zero mean, unit stddev) features from the set of all features in an 135 // instance 136 for (Feature feature : instance.getFeatures()) { 137 if (this.isTransformable(feature)) { 138 // ZMUS features contain a list of features, these are actually what get added 139 // to our document frequency map 140 for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) { 141 String featureName = untransformedFeature.getName(); 142 Object featureValue = untransformedFeature.getValue(); 143 if (featureValue instanceof Number) { 144 MinMaxRunningStat stats; 145 if (featureStatsMap.containsKey(featureName)) { 146 stats = featureStatsMap.get(featureName); 147 } else { 148 stats = new MinMaxRunningStat(); 149 featureStatsMap.put(featureName, stats); 150 } 151 stats.add(((Number) featureValue).doubleValue()); 152 } else { 153 throw new IllegalArgumentException("Cannot normalize non-numeric feature values"); 154 } 155 } 156 } 157 } 158 } 159 160 this.minMaxMap = new HashMap<String, MinMaxPair>(); 161 for (Map.Entry<String, MinMaxRunningStat> entry : featureStatsMap.entrySet()) { 162 MinMaxRunningStat stats = entry.getValue(); 163 this.minMaxMap.put(entry.getKey(), new MinMaxPair(stats.min(), stats.max())); 164 } 165 166 this.isTrained = true; 167 } 168 169 @Override 170 public void save(URI zmusDataUri) throws IOException { 171 // Write out tab separated values: feature_name, mean, stddev 172 File out = new File(zmusDataUri); 173 BufferedWriter writer = null; 174 writer = new BufferedWriter(new FileWriter(out)); 175 176 for (Map.Entry<String, MinMaxPair> entry : this.minMaxMap.entrySet()) { 177 MinMaxPair pair = entry.getValue(); 178 writer.append(String.format(Locale.ROOT, "%s\t%f\t%f\n", entry.getKey(), pair.min, pair.max)); 179 } 180 writer.close(); 181 } 182 183 @Override 184 public void load(URI zmusDataUri) throws IOException { 185 // Reads in tab separated values (feature name, min, max) 186 File in = new File(zmusDataUri); 187 BufferedReader reader = null; 188 this.minMaxMap = new HashMap<String, MinMaxPair>(); 189 reader = new BufferedReader(new FileReader(in)); 190 String line = null; 191 while ((line = reader.readLine()) != null) { 192 String[] featureMeanStddev = line.split("\\t"); 193 this.minMaxMap.put( 194 featureMeanStddev[0], 195 new MinMaxPair( 196 Double.parseDouble(featureMeanStddev[1]), 197 Double.parseDouble(featureMeanStddev[2]))); 198 } 199 reader.close(); 200 201 this.isTrained = true; 202 } 203 204 private static class MinMaxPair { 205 206 public MinMaxPair(double min, double max) { 207 this.min = min; 208 this.max = max; 209 } 210 211 public double min; 212 213 public double max; 214 } 215 216 public static class MinMaxRunningStat implements Serializable { 217 218 /** 219 * 220 */ 221 private static final long serialVersionUID = 1L; 222 223 public MinMaxRunningStat() { 224 this.clear(); 225 } 226 227 public void add(double x) { 228 this.n++; 229 230 if (x < min) { 231 this.min = x; 232 } 233 234 if (x > max) { 235 this.max = x; 236 } 237 } 238 239 public void clear() { 240 this.n = 0; 241 this.min = Double.MAX_VALUE; 242 this.max = Double.MIN_VALUE; 243 } 244 245 public int getNumSamples() { 246 return this.n; 247 } 248 249 public double min() { 250 return this.min; 251 } 252 253 public double max() { 254 return this.max; 255 } 256 257 private double min; 258 259 private double max; 260 261 private int n; 262 263 } 264}