001/* 
002 * Copyright (c) 2012, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.ml.feature.transform.extractor;
025
026import java.io.BufferedReader;
027import java.io.BufferedWriter;
028import java.io.File;
029import java.io.FileReader;
030import java.io.FileWriter;
031import java.io.IOException;
032import java.net.MalformedURLException;
033import java.net.URI;
034import java.net.URISyntaxException;
035import java.net.URL;
036import java.util.ArrayList;
037import java.util.HashMap;
038import java.util.List;
039import java.util.Locale;
040import java.util.Map;
041
042import org.apache.uima.jcas.JCas;
043import org.apache.uima.jcas.tcas.Annotation;
044import org.cleartk.ml.Feature;
045import org.cleartk.ml.Instance;
046import org.cleartk.ml.feature.extractor.CleartkExtractorException;
047import org.cleartk.ml.feature.extractor.FeatureExtractor1;
048import org.cleartk.ml.feature.transform.TransformableFeature;
049
050/**
051 * 
052 * <br>
053 * Copyright (c) 2012, Regents of the University of Colorado <br>
054 * All rights reserved.
055 * 
056 * @author Lee Becker
057 */
058public class CentroidTfidfSimilarityExtractor<OUTCOME_T, FOCUS_T extends Annotation> extends
059    TfidfExtractor<OUTCOME_T, FOCUS_T> {
060
061  private Map<String, Double> centroidMap;
062
063  private SimilarityFunction simFunction;
064
065  private static String docFreqFileSuffix = "_tfidf-centroid-extractor_idfmap.dat";
066
067  private static String centroidMapFileSuffix = "_tfidf-centroid-extractor_centroidmap.dat";
068
069  public static URI getDocumentFrequencyDataURI(String name, URI baseURI)
070      throws MalformedURLException, URISyntaxException {
071    return new URL(baseURI.toURL(), name + docFreqFileSuffix).toURI();
072  }
073
074  public static URI getCentroidDataURI(String name, URI baseURI) throws MalformedURLException,
075      URISyntaxException {
076    return new URL(baseURI.toURL(), name + centroidMapFileSuffix).toURI();
077  }
078
079  public CentroidTfidfSimilarityExtractor(String name) {
080    super(name);
081  }
082
083  public CentroidTfidfSimilarityExtractor(String name, FeatureExtractor1<FOCUS_T> extractor) {
084    super(name);
085    this.subExtractor = extractor;
086    this.isTrained = false;
087    this.idfMap = new IDFMap();
088  }
089
090  @Override
091  public Instance<OUTCOME_T> transform(Instance<OUTCOME_T> instance) {
092    List<Feature> features = new ArrayList<Feature>();
093    List<Feature> featuresToTransform = new ArrayList<Feature>();
094    for (Feature feature : instance.getFeatures()) {
095      if (this.isTransformable(feature)) {
096        // Store off features for later similarity computation
097        featuresToTransform.addAll(((TransformableFeature) feature).getFeatures());
098      } else {
099        // pass through non-transformable features
100        features.add(feature);
101      }
102    }
103
104    // Create centroid similarity feature
105    Map<String, Double> featureMap = this.featuresToFeatureMap(featuresToTransform);
106    features.add(new Feature(this.name, new Double(this.simFunction.distance(
107        featureMap,
108        centroidMap))));
109
110    return new Instance<OUTCOME_T>(instance.getOutcome(), features);
111  }
112
113  public Map<String, Double> featuresToFeatureMap(List<Feature> features) {
114    Map<String, Double> featureMap = new HashMap<String, Double>();
115    for (Feature feature : features) {
116      String termName = feature.getName();
117      int tf = (Integer) feature.getValue();
118      featureMap.put(termName, tf * this.idfMap.getIDF(termName));
119    }
120    return featureMap;
121  }
122
123  @Override
124  public List<Feature> extract(JCas view, FOCUS_T focusAnnotation) throws CleartkExtractorException {
125
126    List<Feature> extracted = this.subExtractor.extract(view, focusAnnotation);
127    List<Feature> result = new ArrayList<Feature>();
128    if (this.isTrained) {
129      // We have trained / loaded a centroid tf*idf model, so now compute
130      // a cosine similarity for the extracted values
131      Map<String, Double> extractedFeatureMap = this.featuresToFeatureMap(extracted);
132      result.add(new Feature(name, this.simFunction.distance(extractedFeatureMap, centroidMap)));
133
134    } else {
135      // We haven't trained this extractor yet, so just mark the existing features
136      // for future modification, by creating one mega container feature
137      result.add(new TransformableFeature(this.name, extracted));
138    }
139
140    return result;
141  }
142
143  protected Map<String, Double> computeCentroid(Iterable<Instance<OUTCOME_T>> instances, IDFMap idfs) {
144
145    // Now compute centroid of all applicable terms (features) in all instances
146    int numDocuments = idfs.getTotalDocumentCount();
147    Map<String, Double> newCentroidMap = new HashMap<String, Double>();
148    for (Instance<OUTCOME_T> instance : instances) {
149
150      // Grab the matching tf*idf features from the set of all features in an instance
151      for (Feature feature : instance.getFeatures()) {
152        if (this.isTransformable(feature)) {
153          // tf*idf features contain a list of features, these are actually what get added
154          // to our document frequency map
155          for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
156            String termName = untransformedFeature.getName();
157            int tf = (Integer) untransformedFeature.getValue();
158            double tfidf = tf * idfs.getIDF(termName);
159            double sumTfidf = (newCentroidMap.containsKey(termName))
160                ? sumTfidf = newCentroidMap.get(termName)
161                : 0.0;
162            newCentroidMap.put(termName, sumTfidf + tfidf);
163          }
164        }
165      }
166    }
167
168    for (Map.Entry<String, Double> entry : newCentroidMap.entrySet()) {
169      double mean = entry.getValue() / numDocuments;
170      newCentroidMap.put(entry.getKey(), mean);
171    }
172    return newCentroidMap;
173  }
174
175  @Override
176  public void train(Iterable<Instance<OUTCOME_T>> instances) {
177    this.idfMap = this.createIdfMap(instances);
178    this.centroidMap = this.computeCentroid(instances, this.idfMap);
179    this.isTrained = true;
180    this.simFunction = new FixedCosineSimilarity(this.centroidMap);
181  }
182
183  @Override
184  public void save(URI baseURI) throws IOException {
185    URI documentFreqDataURI;
186    URI centroidDataURI;
187    try {
188      documentFreqDataURI = getDocumentFrequencyDataURI(this.name, baseURI);
189      centroidDataURI = getCentroidDataURI(this.name, baseURI);
190    } catch (URISyntaxException e) {
191      throw new IOException(e);
192    }
193
194    // Save off idfMap document frequency data
195    this.idfMap.save(documentFreqDataURI);
196
197    // Save off centroid map data
198    File out = new File(centroidDataURI);
199    BufferedWriter writer = null;
200    writer = new BufferedWriter(new FileWriter(out));
201
202    for (Map.Entry<String, Double> entry : this.centroidMap.entrySet()) {
203      writer.append(String.format(Locale.ROOT, "%s\t%f\n", entry.getKey(), entry.getValue()));
204    }
205    writer.close();
206  }
207
208  public void load(URI baseURI) throws IOException {
209    URI documentFreqDataURI;
210    URI centroidDataURI;
211    try {
212      documentFreqDataURI = getDocumentFrequencyDataURI(this.name, baseURI);
213      centroidDataURI = getCentroidDataURI(this.name, baseURI);
214    } catch (URISyntaxException e) {
215      throw new IOException(e);
216    }
217
218    // Load document frequency data
219    this.idfMap.load(documentFreqDataURI);
220
221    // Reads in centroid map as tab separated values (feature name, mean-tfidf value)
222    File in = new File(centroidDataURI);
223    BufferedReader reader = null;
224    this.centroidMap = new HashMap<String, Double>();
225    reader = new BufferedReader(new FileReader(in));
226    String line = null;
227    while ((line = reader.readLine()) != null) {
228      String[] featureMeanTfidf = line.split("\\t");
229      double tfidf = Double.parseDouble(featureMeanTfidf[1]);
230      this.centroidMap.put(featureMeanTfidf[0], tfidf);
231    }
232    reader.close();
233
234    this.simFunction = new FixedCosineSimilarity(this.centroidMap);
235    this.isTrained = true;
236  }
237
238}