001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.ml.viterbi;
025
026import java.lang.reflect.Type;
027import java.util.ArrayList;
028import java.util.Collection;
029import java.util.Collections;
030import java.util.List;
031import java.util.Map;
032
033import org.apache.uima.UimaContext;
034import org.apache.uima.fit.component.initialize.ConfigurationParameterInitializer;
035import org.apache.uima.fit.descriptor.ConfigurationParameter;
036import org.apache.uima.fit.factory.initializable.Initializable;
037import org.apache.uima.resource.ResourceInitializationException;
038import org.cleartk.ml.Classifier;
039import org.cleartk.ml.CleartkProcessingException;
040import org.cleartk.ml.Feature;
041import org.cleartk.ml.SequenceClassifier;
042import org.cleartk.util.CleartkInitializationException;
043import org.cleartk.util.ReflectionUtil;
044import org.cleartk.util.ReflectionUtil.TypeArgumentDelegator;
045
046import com.google.common.base.Functions;
047import com.google.common.base.Objects;
048import com.google.common.base.Objects.ToStringHelper;
049import com.google.common.collect.Lists;
050import com.google.common.collect.Maps;
051import com.google.common.collect.Ordering;
052import com.google.common.primitives.Doubles;
053
054/**
055 * <br>
056 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
057 * All rights reserved.
058 */
059
060public class ViterbiClassifier<OUTCOME_TYPE> implements SequenceClassifier<OUTCOME_TYPE>,
061    Initializable, TypeArgumentDelegator {
062
063  protected Classifier<OUTCOME_TYPE> delegatedClassifier;
064
065  protected OutcomeFeatureExtractor[] outcomeFeatureExtractors;
066
067  public static final String PARAM_STACK_SIZE = "stackSize";
068
069  @ConfigurationParameter(
070      name = PARAM_STACK_SIZE,
071      description = "specifies the maximum number of candidate paths to "
072          + "keep track of. In general, this number should be higher than the number "
073          + "of possible classifications at any given point in the sequence. This "
074          + "guarantees that highest-possible scoring sequence will be returned. If, "
075          + "however, the number of possible classifications is quite high and/or you "
076          + "are concerned about throughput performance, then you may want to reduce the number "
077          + "of candidate paths to maintain.  If Classifier.score is not implemented for the given delegated classifier, then "
078          + "the value of this parameter must be 1. ",
079      defaultValue = "1")
080  protected int stackSize;
081
082  public static final String PARAM_ADD_SCORES = "addScores";
083
084  @ConfigurationParameter(
085      name = PARAM_ADD_SCORES,
086      description = "specifies whether the scores of candidate sequence classifications should be "
087          + "calculated by summing classfication scores for each member of the sequence or by multiplying them. A value of "
088          + "true means that the scores will be summed. A value of false means that the scores will be multiplied. ",
089      defaultValue = "false")
090  protected boolean addScores = false;
091
092  public ViterbiClassifier(
093      Classifier<OUTCOME_TYPE> delegatedClassifier,
094      OutcomeFeatureExtractor[] outcomeFeatureExtractors) {
095    this.delegatedClassifier = delegatedClassifier;
096    this.outcomeFeatureExtractors = outcomeFeatureExtractors;
097  }
098
099  public void initialize(UimaContext context) throws ResourceInitializationException {
100    ConfigurationParameterInitializer.initialize(this, context);
101    if (stackSize < 1) {
102      throw CleartkInitializationException.parameterLessThan(PARAM_STACK_SIZE, 1, stackSize);
103    }
104  }
105
106  public List<OUTCOME_TYPE> classify(List<List<Feature>> features)
107      throws CleartkProcessingException {
108    if (stackSize == 1) {
109      List<Object> outcomes = new ArrayList<Object>();
110      List<OUTCOME_TYPE> returnValues = new ArrayList<OUTCOME_TYPE>();
111      for (List<Feature> instanceFeatures : features) {
112        for (OutcomeFeatureExtractor outcomeFeatureExtractor : outcomeFeatureExtractors) {
113          instanceFeatures.addAll(outcomeFeatureExtractor.extractFeatures(outcomes));
114        }
115        OUTCOME_TYPE outcome = delegatedClassifier.classify(instanceFeatures);
116        outcomes.add(outcome);
117        returnValues.add(outcome);
118      }
119      return returnValues;
120    } else {
121      try {
122        return viterbi(features);
123      } catch (UnsupportedOperationException uoe) {
124        throw CleartkProcessingException.unsupportedOperationSetParameter(
125            uoe,
126            delegatedClassifier,
127            "score",
128            PARAM_STACK_SIZE,
129            1);
130      }
131    }
132
133  }
134
135  /**
136   * This implementation of Viterbi requires at most stackSize * sequenceLength calls to the
137   * classifier. If this proves to be too expensive, then consider using a smaller stack size.
138   * 
139   * @param featureLists
140   *          a sequence-worth of features. Each List<Feature> in features should corresond to all
141   *          of the features for a given element in a sequence to be classified.
142   * @return a list of outcomes (classifications) - one classification for each member of the
143   *         sequence.
144   * @see #PARAM_STACK_SIZE
145   * @see OutcomeFeatureExtractor
146   */
147  public List<OUTCOME_TYPE> viterbi(List<List<Feature>> featureLists)
148      throws CleartkProcessingException {
149
150    if (featureLists == null || featureLists.size() == 0) {
151      return Collections.emptyList();
152    }
153
154    // find the best paths through the outcome lattice
155    Collection<Path> paths = null;
156    for (List<Feature> features : featureLists) {
157
158      // if this is the first instance, start new paths for each outcome
159      if (paths == null) {
160        paths = Lists.newArrayList();
161        Map<OUTCOME_TYPE, Double> scoredOutcomes = this.getScoredOutcomes(features, null);
162        for (OUTCOME_TYPE outcome : this.getTopOutcomes(scoredOutcomes)) {
163          paths.add(new Path(outcome, scoredOutcomes.get(outcome), null));
164        }
165      }
166
167      // for later instances, find the best previous path for each outcome
168      else {
169        Map<OUTCOME_TYPE, Path> maxPaths = Maps.newHashMap();
170        for (Path path : paths) {
171          Map<OUTCOME_TYPE, Double> scoredOutcomes = this.getScoredOutcomes(features, path);
172          for (OUTCOME_TYPE outcome : this.getTopOutcomes(scoredOutcomes)) {
173            double outcomeScore = scoredOutcomes.get(outcome);
174            double score = this.addScores ? path.score + outcomeScore : path.score * outcomeScore;
175            Path maxPath = maxPaths.get(outcome);
176            if (maxPath == null || score > maxPath.score) {
177              maxPaths.put(outcome, new Path(outcome, score, path));
178            }
179          }
180        }
181        paths = maxPaths.values();
182      }
183    }
184
185    // take the maximum of the final paths
186    return Collections.max(paths).outcomes;
187  }
188
189  @Override
190  public List<Map<OUTCOME_TYPE, Double>> score(List<List<Feature>> features)
191      throws CleartkProcessingException {
192    throw new UnsupportedOperationException();
193  }
194
195  public Map<String, Type> getTypeArguments(Class<?> genericType) {
196    if (genericType.equals(SequenceClassifier.class)) {
197      genericType = Classifier.class;
198    }
199    return ReflectionUtil.getTypeArguments(genericType, this.delegatedClassifier);
200  }
201
202  private Map<OUTCOME_TYPE, Double> getScoredOutcomes(List<Feature> features, Path path)
203      throws CleartkProcessingException {
204
205    // add the features from preceding outcomes
206    features = Lists.newArrayList(features);
207    if (path != null) {
208      List<Object> previousOutcomes = new ArrayList<Object>(path.outcomes);
209      for (OutcomeFeatureExtractor outcomeFeatureExtractor : this.outcomeFeatureExtractors) {
210        features.addAll(outcomeFeatureExtractor.extractFeatures(previousOutcomes));
211      }
212    }
213
214    // get the scored outcomes for this instance
215    Map<OUTCOME_TYPE, Double> scoredOutcomes = this.delegatedClassifier.score(features);
216    if (scoredOutcomes.isEmpty()) {
217      throw new IllegalStateException("expected at least one scored outcome, found "
218          + scoredOutcomes);
219    }
220    return scoredOutcomes;
221  }
222
223  private List<OUTCOME_TYPE> getTopOutcomes(Map<OUTCOME_TYPE, Double> scoredOutcomes) {
224    // get just the outcomes that fit within the stack
225    Ordering<OUTCOME_TYPE> ordering = Ordering.natural().onResultOf(
226        Functions.forMap(scoredOutcomes));
227    return ordering.greatestOf(scoredOutcomes.keySet(), this.stackSize);
228  }
229
230  private class Path implements Comparable<Path> {
231    public OUTCOME_TYPE outcome;
232
233    public double score;
234
235    public Path parent;
236
237    public List<OUTCOME_TYPE> outcomes;
238
239    public Path(OUTCOME_TYPE outcome, double score, Path parent) {
240      this.outcome = outcome;
241      this.score = score;
242      this.parent = parent;
243      this.outcomes = Lists.newArrayList();
244      if (this.parent != null) {
245        this.outcomes.addAll(this.parent.outcomes);
246      }
247      this.outcomes.add(this.outcome);
248    }
249
250    @Override
251    public String toString() {
252      ToStringHelper helper = Objects.toStringHelper(this);
253      helper.add("outcome", this.outcome);
254      helper.add("score", this.score);
255      helper.add("parent", this.parent);
256      return helper.toString();
257    }
258
259    @Override
260    public int compareTo(Path that) {
261      return Doubles.compare(this.score, that.score);
262    }
263  }
264}