/*
 * Copyright (c) 2012, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.ml.feature.transform.extractor;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.cleartk.ml.Feature;
import org.cleartk.ml.Instance;
import org.cleartk.ml.feature.extractor.CleartkExtractorException;
import org.cleartk.ml.feature.extractor.FeatureExtractor1;
import org.cleartk.ml.feature.transform.TransformableFeature;

/**
 * Trainable extractor that replaces a group of term-count features with a single feature whose
 * value is the similarity between the tf*idf vector of those terms and a centroid tf*idf vector.
 * <p>
 * Training builds an IDF map and the centroid (the per-term sum of tf*idf over all instances
 * divided by the IDF map's total document count). Until the extractor has been trained or
 * loaded, {@link #extract(JCas, Annotation)} wraps the sub-extractor's output in one
 * {@link TransformableFeature} so that it can be converted later by
 * {@link #transform(Instance)}.
 *
 * <br>
 * Copyright (c) 2012, Regents of the University of Colorado <br>
 * All rights reserved.
 *
 * @author Lee Becker
 */
public class CentroidTfidfSimilarityExtractor<OUTCOME_T, FOCUS_T extends Annotation> extends
    TfidfExtractor<OUTCOME_T, FOCUS_T> {

  /** File-name suffix of the persisted document-frequency (IDF) data. */
  private static final String DOC_FREQ_FILE_SUFFIX = "_tfidf-centroid-extractor_idfmap.dat";

  /** File-name suffix of the persisted centroid data. */
  private static final String CENTROID_MAP_FILE_SUFFIX =
      "_tfidf-centroid-extractor_centroidmap.dat";

  /** Term name to mean tf*idf value; set by {@link #train(Iterable)} or {@link #load(URI)}. */
  private Map<String, Double> centroidMap;

  /** Similarity function built around the centroid; set together with {@link #centroidMap}. */
  private SimilarityFunction simFunction;

  /**
   * Builds the location of the document-frequency data file for an extractor.
   *
   * @param name
   *          extractor name, used as the file-name prefix
   * @param baseURI
   *          location against which the file name is resolved
   * @return URI of {@code name + "_tfidf-centroid-extractor_idfmap.dat"} resolved against
   *         {@code baseURI}
   * @throws MalformedURLException
   *           if the base URI or the resolved location cannot be expressed as a URL
   * @throws URISyntaxException
   *           if the resolved URL cannot be converted back to a URI
   */
  public static URI getDocumentFrequencyDataURI(String name, URI baseURI)
      throws MalformedURLException, URISyntaxException {
    return new URL(baseURI.toURL(), name + DOC_FREQ_FILE_SUFFIX).toURI();
  }

  /**
   * Builds the location of the centroid data file for an extractor.
   *
   * @param name
   *          extractor name, used as the file-name prefix
   * @param baseURI
   *          location against which the file name is resolved
   * @return URI of {@code name + "_tfidf-centroid-extractor_centroidmap.dat"} resolved against
   *         {@code baseURI}
   * @throws MalformedURLException
   *           if the base URI or the resolved location cannot be expressed as a URL
   * @throws URISyntaxException
   *           if the resolved URL cannot be converted back to a URI
   */
  public static URI getCentroidDataURI(String name, URI baseURI) throws MalformedURLException,
      URISyntaxException {
    return new URL(baseURI.toURL(), name + CENTROID_MAP_FILE_SUFFIX).toURI();
  }

  /**
   * Creates an extractor identified by {@code name}; all other state is whatever the superclass
   * constructor sets up.
   *
   * @param name
   *          extractor name, also used as the name of the produced feature
   */
  public CentroidTfidfSimilarityExtractor(String name) {
    super(name);
  }

  /**
   * Creates an untrained extractor with an empty IDF map.
   *
   * @param name
   *          extractor name, also used as the name of the produced feature
   * @param extractor
   *          sub-extractor whose output features are treated as term counts
   */
  public CentroidTfidfSimilarityExtractor(String name, FeatureExtractor1<FOCUS_T> extractor) {
    super(name);
    this.subExtractor = extractor;
    this.isTrained = false;
    this.idfMap = new IDFMap();
  }

  /**
   * Replaces the transformable features of an instance with one centroid-similarity feature.
   * Non-transformable features are passed through unchanged, ahead of the new feature.
   *
   * @param instance
   *          instance whose features are to be transformed
   * @return a new instance with the same outcome and the rewritten feature list
   */
  @Override
  public Instance<OUTCOME_T> transform(Instance<OUTCOME_T> instance) {
    List<Feature> features = new ArrayList<Feature>();
    List<Feature> featuresToTransform = new ArrayList<Feature>();
    for (Feature feature : instance.getFeatures()) {
      if (this.isTransformable(feature)) {
        // Collect the wrapped term features for the similarity computation below
        featuresToTransform.addAll(((TransformableFeature) feature).getFeatures());
      } else {
        // Pass through non-transformable features
        features.add(feature);
      }
    }

    // Create the centroid similarity feature
    Map<String, Double> featureMap = this.featuresToFeatureMap(featuresToTransform);
    features.add(new Feature(this.name, Double.valueOf(this.simFunction.distance(
        featureMap,
        this.centroidMap))));

    return new Instance<OUTCOME_T>(instance.getOutcome(), features);
  }

  /**
   * Converts term-count features into a tf*idf map using this extractor's IDF map.
   *
   * @param features
   *          features whose name is the term and whose value is an {@link Integer} term count
   * @return map from term name to {@code tf * idf}; a later feature with the same name overwrites
   *         an earlier one
   * @throws ClassCastException
   *           if a feature value is not an {@link Integer}
   */
  public Map<String, Double> featuresToFeatureMap(List<Feature> features) {
    Map<String, Double> featureMap = new HashMap<String, Double>();
    for (Feature feature : features) {
      String termName = feature.getName();
      int tf = (Integer) feature.getValue();
      featureMap.put(termName, tf * this.idfMap.getIDF(termName));
    }
    return featureMap;
  }

  /**
   * Extracts features for the focus annotation.
   *
   * @param view
   *          the CAS view handed to the sub-extractor
   * @param focusAnnotation
   *          the annotation handed to the sub-extractor
   * @return a single-element list: the similarity feature when trained, otherwise a
   *         {@link TransformableFeature} wrapping the sub-extractor's output
   * @throws CleartkExtractorException
   *           if the sub-extractor fails
   */
  @Override
  public List<Feature> extract(JCas view, FOCUS_T focusAnnotation) throws CleartkExtractorException {

    List<Feature> extracted = this.subExtractor.extract(view, focusAnnotation);
    List<Feature> result = new ArrayList<Feature>();
    if (this.isTrained) {
      // A centroid tf*idf model has been trained / loaded, so compute the similarity
      // of the extracted values against it
      Map<String, Double> extractedFeatureMap = this.featuresToFeatureMap(extracted);
      result.add(new Feature(name, this.simFunction.distance(extractedFeatureMap, centroidMap)));
    } else {
      // Not trained yet: mark the extracted features for later transformation by
      // wrapping them in one container feature
      result.add(new TransformableFeature(this.name, extracted));
    }

    return result;
  }

  /**
   * Computes the centroid of the tf*idf vectors found in the transformable features of the given
   * instances.
   *
   * @param instances
   *          instances to aggregate; only their transformable features contribute
   * @param idfs
   *          IDF map supplying the per-term IDF and the total document count used as divisor
   * @return map from term name to the summed tf*idf divided by
   *         {@code idfs.getTotalDocumentCount()}
   */
  protected Map<String, Double> computeCentroid(Iterable<Instance<OUTCOME_T>> instances, IDFMap idfs) {

    int numDocuments = idfs.getTotalDocumentCount();
    Map<String, Double> newCentroidMap = new HashMap<String, Double>();

    // First pass: accumulate the tf*idf sum of every term over all instances
    for (Instance<OUTCOME_T> instance : instances) {
      for (Feature feature : instance.getFeatures()) {
        if (!this.isTransformable(feature)) {
          continue;
        }
        // A transformable feature holds the list of raw term-count features
        for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
          String termName = untransformedFeature.getName();
          int tf = (Integer) untransformedFeature.getValue();
          double tfidf = tf * idfs.getIDF(termName);

          Double previousSum = newCentroidMap.get(termName);
          double sumTfidf = 0.0;
          if (previousSum != null) {
            sumTfidf = previousSum;
          }
          newCentroidMap.put(termName, sumTfidf + tfidf);
        }
      }
    }

    // Second pass: turn sums into means. setValue updates the entry in place instead of
    // writing to the map that is being iterated.
    for (Map.Entry<String, Double> entry : newCentroidMap.entrySet()) {
      entry.setValue(entry.getValue() / numDocuments);
    }
    return newCentroidMap;
  }

  /**
   * Trains the extractor: builds the IDF map, computes the centroid, creates the similarity
   * function around it and marks the extractor as trained.
   *
   * @param instances
   *          training instances; iterated once for the IDF map and once for the centroid
   */
  @Override
  public void train(Iterable<Instance<OUTCOME_T>> instances) {
    this.idfMap = this.createIdfMap(instances);
    this.centroidMap = this.computeCentroid(instances, this.idfMap);
    this.isTrained = true;
    this.simFunction = new FixedCosineSimilarity(this.centroidMap);
  }

  /**
   * Persists the IDF map and the centroid next to {@code baseURI}. The centroid file holds one
   * {@code term<TAB>value} line per entry.
   *
   * @param baseURI
   *          location against which the two data file names are resolved
   * @throws IOException
   *           if a data location cannot be built or a file cannot be written
   */
  @Override
  public void save(URI baseURI) throws IOException {
    URI documentFreqDataURI;
    URI centroidDataURI;
    try {
      documentFreqDataURI = getDocumentFrequencyDataURI(this.name, baseURI);
      centroidDataURI = getCentroidDataURI(this.name, baseURI);
    } catch (URISyntaxException e) {
      throw new IOException(e);
    }

    // Save off idfMap document frequency data
    this.idfMap.save(documentFreqDataURI);

    // Save off centroid map data
    File out = new File(centroidDataURI);
    BufferedWriter writer = new BufferedWriter(new FileWriter(out));
    try {
      for (Map.Entry<String, Double> entry : this.centroidMap.entrySet()) {
        // Double.toString is locale-independent and is restored exactly by
        // Double.parseDouble; a fixed "%f" would keep only six fractional digits and make
        // the reloaded centroid differ from the trained one.
        writer.append(entry.getKey())
            .append('\t')
            .append(Double.toString(entry.getValue()))
            .append('\n');
      }
    } finally {
      // Release the file handle even when writing fails part-way
      writer.close();
    }
  }

  /**
   * Restores the IDF map and the centroid written by {@link #save(URI)}, rebuilds the similarity
   * function and marks the extractor as trained. The centroid held by this instance is replaced
   * only after the centroid file has been read completely.
   *
   * @param baseURI
   *          location against which the two data file names are resolved
   * @throws IOException
   *           if a data location cannot be built or a file cannot be read
   */
  public void load(URI baseURI) throws IOException {
    URI documentFreqDataURI;
    URI centroidDataURI;
    try {
      documentFreqDataURI = getDocumentFrequencyDataURI(this.name, baseURI);
      centroidDataURI = getCentroidDataURI(this.name, baseURI);
    } catch (URISyntaxException e) {
      throw new IOException(e);
    }

    // Load document frequency data
    this.idfMap.load(documentFreqDataURI);

    // Read the centroid map as tab separated values (feature name, mean-tfidf value)
    File in = new File(centroidDataURI);
    Map<String, Double> loadedCentroid = new HashMap<String, Double>();
    BufferedReader reader = new BufferedReader(new FileReader(in));
    try {
      String line;
      while ((line = reader.readLine()) != null) {
        String[] featureMeanTfidf = line.split("\\t");
        double tfidf = Double.parseDouble(featureMeanTfidf[1]);
        loadedCentroid.put(featureMeanTfidf[0], tfidf);
      }
    } finally {
      // Release the file handle even when a line fails to parse
      reader.close();
    }

    this.centroidMap = loadedCentroid;
    this.simFunction = new FixedCosineSimilarity(this.centroidMap);
    this.isTrained = true;
  }

}