001/* 002 * Copyright (c) 2013, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.ml.liblinear.encoder; 025 026import java.io.File; 027import java.io.IOException; 028import java.util.Map; 029 030import org.cleartk.ml.Feature; 031import org.cleartk.ml.encoder.CleartkEncoderException; 032import org.cleartk.ml.encoder.features.FeaturesEncoder; 033 034import com.google.common.collect.Maps; 035 036import de.bwaldvogel.liblinear.FeatureNode; 037 038/** 039 * <br> 040 * Copyright (c) 2013, Regents of the University of Colorado <br> 041 * All rights reserved. 042 * 043 * @author Steven Bethard 044 */ 045public class FeatureNodeArrayEncoder implements FeaturesEncoder<FeatureNode[]> { 046 private static final long serialVersionUID = 1L; 047 048 private static final String BIAS_NAME = FeatureNodeArrayEncoder.class.getName() + ".BIAS"; 049 050 private Map<String, Integer> stringToInt; 051 052 private int biasIndex; 053 054 private boolean isFinalized; 055 056 public FeatureNodeArrayEncoder() { 057 this.stringToInt = Maps.newHashMap(); 058 this.biasIndex = 1; 059 this.isFinalized = false; 060 this.stringToInt.put(BIAS_NAME, biasIndex); 061 } 062 063 @Override 064 public FeatureNode[] encodeAll(Iterable<Feature> features) throws CleartkEncoderException { 065 // map feature indexes to feature nodes, sorting by index 066 Map<Integer, FeatureNode> featureNodes = Maps.newTreeMap(); 067 068 // add a "bias" feature node; otherwise LIBLINEAR is unable to predict the majority class for 069 // instances consisting entirely of features never seen during training 070 featureNodes.put(this.biasIndex, new FeatureNode(this.biasIndex, 1)); 071 072 // add nodes for all the features 073 for (Feature feature : features) { 074 075 // convert features to a String name and a double value 076 String name; 077 double value; 078 if (feature.getValue() instanceof Number) { 079 name = feature.getName(); 080 value = ((Number) feature.getValue()).doubleValue(); 081 } else { 082 name = Feature.createName(feature.getName(), feature.getValue().toString()); 083 value = 1.0; 084 } 085 086 // convert the name String to an index 087 if (!this.stringToInt.containsKey(name)) { 088 if (!this.isFinalized) { 089 this.stringToInt.put(name, this.stringToInt.size() + 1); 090 } 091 092 // don't create feature nodes for features not seen before finalization 093 else { 094 continue; 095 } 096 } 097 int index = this.stringToInt.get(name); 098 099 // create a feature node for the given index 100 // NOTE: if there are duplicate features, only the last will be kept 101 featureNodes.put(index, new FeatureNode(index, value)); 102 } 103 104 // put the feature nodes into an array, sorted by feature index 105 FeatureNode[] featureNodeArray = new FeatureNode[featureNodes.size()]; 106 int i = 0; 107 for (Integer index : featureNodes.keySet()) { 108 featureNodeArray[i] = featureNodes.get(index); 109 ++i; 110 } 111 return featureNodeArray; 112 } 113 114 @Override 115 public void finalizeFeatureSet(File outputDirectory) throws IOException { 116 this.isFinalized = true; 117 } 118}