001/* 
002 * Copyright (c) 2012, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.ml.chunking;
025
026import java.lang.reflect.Constructor;
027import java.lang.reflect.InvocationTargetException;
028import java.util.ArrayList;
029import java.util.List;
030import java.util.Map;
031
032import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
033import org.apache.uima.cas.Feature;
034import org.apache.uima.jcas.JCas;
035import org.apache.uima.jcas.tcas.Annotation;
036
037/**
038 * Base class for classes that assemble individual classifier outcomes on smaller annotations
039 * ("sub-chunks") to form larger annotations ("chunks").
040 * 
041 * <br>
042 * Copyright (c) 2012, Regents of the University of Colorado <br>
043 * All rights reserved.
044 * 
045 * @author Steven Bethard
046 */
047public abstract class Chunking_ImplBase<SUB_CHUNK_TYPE extends Annotation, CHUNK_TYPE extends Annotation>
048    implements Chunking<String, SUB_CHUNK_TYPE, CHUNK_TYPE> {
049
050  protected Class<? extends CHUNK_TYPE> chunkClass;
051
052  protected Class<? extends SUB_CHUNK_TYPE> subChunkClass;
053
054  protected String featureFullName;
055
056  public Chunking_ImplBase(
057      Class<? extends SUB_CHUNK_TYPE> subChunkClass,
058      Class<? extends CHUNK_TYPE> chunkClass,
059      String featureName) {
060    this.subChunkClass = subChunkClass;
061    this.chunkClass = chunkClass;
062    this.featureFullName = featureName == null ? null : chunkClass.getCanonicalName() + ":"
063        + featureName;
064  }
065
066  protected Feature getFeature(JCas jCas) {
067    String name = this.featureFullName;
068    return name == null ? null : jCas.getTypeSystem().getFeatureByFullName(name);
069  }
070
071  protected String getOutcomeSuffix(CHUNK_TYPE chunk, Feature feature) {
072    return feature == null ? "" : "-" + chunk.getFeatureValueAsString(feature);
073  }
074
075  /**
076   * Produce a map from sub-chunk annotations to their outcome prefixes (e.g. 'I', 'O').
077   * 
078   * If any sub-chunk annotations are not included in the map, they will be given prefix 'O'.
079   * 
080   * @param subChunks
081   *          The sub-annotations that make up the chunks.
082   * @param chunks
083   *          The chunk annotations.
084   * @return A mapping from chunk sub-chunk annotations to outcome prefixes.
085   */
086  protected abstract Map<SUB_CHUNK_TYPE, String> getSubChunkToOutcomeMap(
087      JCas jCas,
088      List<SUB_CHUNK_TYPE> subChunks,
089      List<CHUNK_TYPE> chunks);
090
091  @Override
092  public List<String> createOutcomes(
093      JCas jCas,
094      List<SUB_CHUNK_TYPE> subChunks,
095      List<CHUNK_TYPE> chunks) throws AnalysisEngineProcessException {
096
097    // get the mapping from sub-chunks to their outcomes
098    Map<SUB_CHUNK_TYPE, String> subChunkToOutcome;
099    subChunkToOutcome = this.getSubChunkToOutcomeMap(jCas, subChunks, chunks);
100
101    // create one outcome for each sub-chunk by combining the prefix and feature value
102    List<String> outcomes = new ArrayList<String>();
103    for (SUB_CHUNK_TYPE subChunk : subChunks) {
104      String outcome = subChunkToOutcome.get(subChunk);
105      if (outcome == null) {
106        outcome = "O";
107      }
108      outcomes.add(outcome);
109    }
110    return outcomes;
111  }
112
113  /**
114   * Determines whether the current outcome represents the end of a chunk.
115   * 
116   * Both the current outcome and the following outcome are provided for making this decision.
117   * 
118   * @param currPrefix
119   *          The prefix of the current outcome
120   * @param currLabel
121   *          The label of the current outcome
122   * @param nextPrefix
123   *          The prefix of the following outcome
124   * @param nextLabel
125   *          The label of the following outcome
126   * @return True if the current outcome represents the end of a chunk
127   */
128  protected abstract boolean isEndOfChunk(
129      char currPrefix,
130      String currLabel,
131      char nextPrefix,
132      String nextLabel);
133
134  @Override
135  public List<CHUNK_TYPE> createChunks(
136      JCas jCas,
137      List<SUB_CHUNK_TYPE> subChunks,
138      List<String> outcomes) throws AnalysisEngineProcessException {
139
140    // validate parameters
141    int nSubChunks = subChunks.size();
142    int nOutcomes = outcomes.size();
143    if (nSubChunks != nOutcomes) {
144      String message = "expected the same number of sub-chunks (%d) as outcome s(%d)";
145      throw new IllegalArgumentException(String.format(message, nSubChunks, nOutcomes));
146    }
147
148    // get the Feature object if we need to assign an attribute
149    Feature feature;
150    if (this.featureFullName == null) {
151      feature = null;
152    } else {
153      feature = jCas.getTypeSystem().getFeatureByFullName(this.featureFullName);
154    }
155
156    // parse outcomes, and add a final Outside outcome for ease of parsing
157    List<ChunkOutcome> chunkOutcomes = new ArrayList<ChunkOutcome>();
158    for (String outcome : outcomes) {
159      chunkOutcomes.add(new ChunkOutcome(outcome));
160    }
161    chunkOutcomes.add(new ChunkOutcome("O"));
162
163    // create chunk annotations as appropriate for the outcomes
164    List<CHUNK_TYPE> chunks = new ArrayList<CHUNK_TYPE>();
165    for (int i = 0; i < outcomes.size(); ++i) {
166      ChunkOutcome outcome = chunkOutcomes.get(i);
167
168      // if we're at the beginning of a chunk, gather outcomes until we hit the end of the chunk
169      // (a chunk ends when we hit 'O' or when the label change, e.g. I-PER I-ORG)
170      if (outcome.prefix != 'O') {
171
172        // advance to the end of this chunk
173        int begin = i;
174        int end = i;
175        while (true) {
176          ChunkOutcome curr = chunkOutcomes.get(end);
177          ChunkOutcome next = chunkOutcomes.get(end + 1);
178          if (this.isEndOfChunk(curr.prefix, curr.label, next.prefix, next.label)) {
179            break;
180          }
181          ++end;
182        }
183
184        // skip over all the outcomes we just consumed
185        i = end;
186
187        // convert the outcome indexes into CAS offsets
188        begin = subChunks.get(begin).getBegin();
189        end = subChunks.get(end).getEnd();
190
191        // construct the chunk annotation
192        Constructor<? extends CHUNK_TYPE> constructor;
193        try {
194          constructor = this.chunkClass.getConstructor(JCas.class, int.class, int.class);
195        } catch (NoSuchMethodException e) {
196          throw new AnalysisEngineProcessException(e);
197        }
198        CHUNK_TYPE chunk;
199        try {
200          chunk = constructor.newInstance(jCas, begin, end);
201        } catch (InstantiationException e) {
202          throw new AnalysisEngineProcessException(e);
203        } catch (IllegalAccessException e) {
204          throw new AnalysisEngineProcessException(e);
205        } catch (InvocationTargetException e) {
206          throw new AnalysisEngineProcessException(e);
207        }
208
209        // set the annotation feature if necessary
210        if (this.featureFullName != null) {
211          chunk.setFeatureValueFromString(feature, outcome.label);
212        }
213
214        // add the chunk to the CAS and to the result list
215        chunk.addToIndexes();
216        chunks.add(chunk);
217      }
218    }
219    return chunks;
220  }
221
222  private static class ChunkOutcome {
223    public char prefix;
224
225    public String label;
226
227    public ChunkOutcome(String outcome) {
228      this.prefix = outcome.charAt(0);
229      this.label = outcome.length() < 2 ? "" : outcome.substring(2);
230    }
231  }
232}