001/** 
002 * Copyright (c) 2012, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024
025package org.cleartk.ml.feature.transform.extractor;
026
027import java.io.BufferedReader;
028import java.io.BufferedWriter;
029import java.io.File;
030import java.io.FileReader;
031import java.io.FileWriter;
032import java.io.IOException;
033import java.io.Serializable;
034import java.net.URI;
035import java.util.ArrayList;
036import java.util.HashMap;
037import java.util.List;
038import java.util.Locale;
039import java.util.Map;
040
041import org.apache.uima.jcas.JCas;
042import org.apache.uima.jcas.tcas.Annotation;
043import org.cleartk.ml.Feature;
044import org.cleartk.ml.Instance;
045import org.cleartk.ml.feature.extractor.CleartkExtractorException;
046import org.cleartk.ml.feature.extractor.FeatureExtractor1;
047import org.cleartk.ml.feature.transform.OneToOneTrainableExtractor_ImplBase;
048import org.cleartk.ml.feature.transform.TransformableFeature;
049
050/**
051 * Scales features extracted by its subextractor to range 0-1, by scaling by the minimum and maximum
052 * values
053 * <p>
054 * 
055 * Copyright (c) 2012, Regents of the University of Colorado <br>
056 * All rights reserved.
057 * 
058 * @author Lee Becker
059 * 
060 */
061public class MinMaxNormalizationExtractor<OUTCOME_T, FOCUS_T extends Annotation> extends
062    OneToOneTrainableExtractor_ImplBase<OUTCOME_T> implements FeatureExtractor1<FOCUS_T> {
063
064  private FeatureExtractor1<FOCUS_T> subExtractor;
065
066  private boolean isTrained;
067
068  // This is read in after training for use in transformation
069  private Map<String, MinMaxPair> minMaxMap;
070
071  public MinMaxNormalizationExtractor(String name) {
072    this(name, null);
073  }
074
075  public MinMaxNormalizationExtractor(String name, FeatureExtractor1<FOCUS_T> subExtractor) {
076    super(name);
077    this.subExtractor = subExtractor;
078    this.isTrained = false;
079  }
080
081  @Override
082  protected Feature transform(Feature feature) {
083    String featureName = feature.getName();
084    MinMaxPair stats = this.minMaxMap.get(featureName);
085
086    double mmn = 0.5d; // this is the default value we will return if we've never seen the feature
087                       // before
088
089    double value = ((Number) feature.getValue()).doubleValue();
090    // this is the typical case
091    if (stats != null && stats.min < stats.max) {
092      mmn = (value - stats.min) / (stats.max - stats.min);
093    }
094    // this is an edge case that could happen when the value is always the same
095    if (stats != null && stats.min == stats.max) {
096      if (value == stats.min) {
097        mmn = 0.5d;
098      } else {
099        mmn = value < stats.min ? 0 : 1;
100      }
101    }
102    mmn = Math.max(0, mmn); // if mmn is negative, then return zero (this would happen if the
103                            // feature value was a the smallest value yet seen)
104    mmn = Math.min(1, mmn); // if mmn is more than one, then return 1 (this would happen if the
105                            // feature value was the largest value yet seen)
106    return new Feature("MINMAX_NORMED_" + featureName, mmn);
107  }
108
109  @Override
110  public List<Feature> extract(JCas view, FOCUS_T focusAnnotation) throws CleartkExtractorException {
111
112    List<Feature> extracted = this.subExtractor.extract(view, focusAnnotation);
113    List<Feature> result = new ArrayList<Feature>();
114    if (this.isTrained) {
115      // We have trained / loaded a MinMax model, so now fix up the values
116      for (Feature feature : extracted) {
117        result.add(this.transform(feature));
118      }
119    } else {
120      // We haven't trained this extractor yet, so just mark the existing features
121      // for future modification, by creating one mega container feature
122      result.add(new TransformableFeature(this.name, extracted));
123    }
124
125    return result;
126  }
127
128  @Override
129  public void train(Iterable<Instance<OUTCOME_T>> instances) {
130    Map<String, MinMaxRunningStat> featureStatsMap = new HashMap<String, MinMaxRunningStat>();
131
132    // keep a running mean and standard deviation for all applicable features
133    for (Instance<OUTCOME_T> instance : instances) {
134      // Grab the matching zmus (zero mean, unit stddev) features from the set of all features in an
135      // instance
136      for (Feature feature : instance.getFeatures()) {
137        if (this.isTransformable(feature)) {
138          // ZMUS features contain a list of features, these are actually what get added
139          // to our document frequency map
140          for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
141            String featureName = untransformedFeature.getName();
142            Object featureValue = untransformedFeature.getValue();
143            if (featureValue instanceof Number) {
144              MinMaxRunningStat stats;
145              if (featureStatsMap.containsKey(featureName)) {
146                stats = featureStatsMap.get(featureName);
147              } else {
148                stats = new MinMaxRunningStat();
149                featureStatsMap.put(featureName, stats);
150              }
151              stats.add(((Number) featureValue).doubleValue());
152            } else {
153              throw new IllegalArgumentException("Cannot normalize non-numeric feature values");
154            }
155          }
156        }
157      }
158    }
159
160    this.minMaxMap = new HashMap<String, MinMaxPair>();
161    for (Map.Entry<String, MinMaxRunningStat> entry : featureStatsMap.entrySet()) {
162      MinMaxRunningStat stats = entry.getValue();
163      this.minMaxMap.put(entry.getKey(), new MinMaxPair(stats.min(), stats.max()));
164    }
165
166    this.isTrained = true;
167  }
168
169  @Override
170  public void save(URI zmusDataUri) throws IOException {
171    // Write out tab separated values: feature_name, mean, stddev
172    File out = new File(zmusDataUri);
173    BufferedWriter writer = null;
174    writer = new BufferedWriter(new FileWriter(out));
175
176    for (Map.Entry<String, MinMaxPair> entry : this.minMaxMap.entrySet()) {
177      MinMaxPair pair = entry.getValue();
178      writer.append(String.format(Locale.ROOT, "%s\t%f\t%f\n", entry.getKey(), pair.min, pair.max));
179    }
180    writer.close();
181  }
182
183  @Override
184  public void load(URI zmusDataUri) throws IOException {
185    // Reads in tab separated values (feature name, min, max)
186    File in = new File(zmusDataUri);
187    BufferedReader reader = null;
188    this.minMaxMap = new HashMap<String, MinMaxPair>();
189    reader = new BufferedReader(new FileReader(in));
190    String line = null;
191    while ((line = reader.readLine()) != null) {
192      String[] featureMeanStddev = line.split("\\t");
193      this.minMaxMap.put(
194          featureMeanStddev[0],
195          new MinMaxPair(
196              Double.parseDouble(featureMeanStddev[1]),
197              Double.parseDouble(featureMeanStddev[2])));
198    }
199    reader.close();
200
201    this.isTrained = true;
202  }
203
204  private static class MinMaxPair {
205
206    public MinMaxPair(double min, double max) {
207      this.min = min;
208      this.max = max;
209    }
210
211    public double min;
212
213    public double max;
214  }
215
216  public static class MinMaxRunningStat implements Serializable {
217
218    /**
219     * 
220     */
221    private static final long serialVersionUID = 1L;
222
223    public MinMaxRunningStat() {
224      this.clear();
225    }
226
227    public void add(double x) {
228      this.n++;
229
230      if (x < min) {
231        this.min = x;
232      }
233
234      if (x > max) {
235        this.max = x;
236      }
237    }
238
239    public void clear() {
240      this.n = 0;
241      this.min = Double.MAX_VALUE;
242      this.max = Double.MIN_VALUE;
243    }
244
245    public int getNumSamples() {
246      return this.n;
247    }
248
249    public double min() {
250      return this.min;
251    }
252
253    public double max() {
254      return this.max;
255    }
256
257    private double min;
258
259    private double max;
260
261    private int n;
262
263  }
264}