001/* 002 * Copyright (c) 2012, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.ml.chunking; 025 026import java.lang.reflect.Constructor; 027import java.lang.reflect.InvocationTargetException; 028import java.util.ArrayList; 029import java.util.List; 030import java.util.Map; 031 032import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 033import org.apache.uima.cas.Feature; 034import org.apache.uima.jcas.JCas; 035import org.apache.uima.jcas.tcas.Annotation; 036 037/** 038 * Base class for classes that assemble individual classifier outcomes on smaller annotations 039 * ("sub-chunks") to form larger annotations ("chunks"). 040 * 041 * <br> 042 * Copyright (c) 2012, Regents of the University of Colorado <br> 043 * All rights reserved. 044 * 045 * @author Steven Bethard 046 */ 047public abstract class Chunking_ImplBase<SUB_CHUNK_TYPE extends Annotation, CHUNK_TYPE extends Annotation> 048 implements Chunking<String, SUB_CHUNK_TYPE, CHUNK_TYPE> { 049 050 protected Class<? extends CHUNK_TYPE> chunkClass; 051 052 protected Class<? extends SUB_CHUNK_TYPE> subChunkClass; 053 054 protected String featureFullName; 055 056 public Chunking_ImplBase( 057 Class<? extends SUB_CHUNK_TYPE> subChunkClass, 058 Class<? extends CHUNK_TYPE> chunkClass, 059 String featureName) { 060 this.subChunkClass = subChunkClass; 061 this.chunkClass = chunkClass; 062 this.featureFullName = featureName == null ? null : chunkClass.getCanonicalName() + ":" 063 + featureName; 064 } 065 066 protected Feature getFeature(JCas jCas) { 067 String name = this.featureFullName; 068 return name == null ? null : jCas.getTypeSystem().getFeatureByFullName(name); 069 } 070 071 protected String getOutcomeSuffix(CHUNK_TYPE chunk, Feature feature) { 072 return feature == null ? "" : "-" + chunk.getFeatureValueAsString(feature); 073 } 074 075 /** 076 * Produce a map from sub-chunk annotations to their outcome prefixes (e.g. 'I', 'O'). 077 * 078 * If any sub-chunk annotations are not included in the map, they will be given prefix 'O'. 079 * 080 * @param subChunks 081 * The sub-annotations that make up the chunks. 082 * @param chunks 083 * The chunk annotations. 084 * @return A mapping from chunk sub-chunk annotations to outcome prefixes. 085 */ 086 protected abstract Map<SUB_CHUNK_TYPE, String> getSubChunkToOutcomeMap( 087 JCas jCas, 088 List<SUB_CHUNK_TYPE> subChunks, 089 List<CHUNK_TYPE> chunks); 090 091 @Override 092 public List<String> createOutcomes( 093 JCas jCas, 094 List<SUB_CHUNK_TYPE> subChunks, 095 List<CHUNK_TYPE> chunks) throws AnalysisEngineProcessException { 096 097 // get the mapping from sub-chunks to their outcomes 098 Map<SUB_CHUNK_TYPE, String> subChunkToOutcome; 099 subChunkToOutcome = this.getSubChunkToOutcomeMap(jCas, subChunks, chunks); 100 101 // create one outcome for each sub-chunk by combining the prefix and feature value 102 List<String> outcomes = new ArrayList<String>(); 103 for (SUB_CHUNK_TYPE subChunk : subChunks) { 104 String outcome = subChunkToOutcome.get(subChunk); 105 if (outcome == null) { 106 outcome = "O"; 107 } 108 outcomes.add(outcome); 109 } 110 return outcomes; 111 } 112 113 /** 114 * Determines whether the current outcome represents the end of a chunk. 115 * 116 * Both the current outcome and the following outcome are provided for making this decision. 117 * 118 * @param currPrefix 119 * The prefix of the current outcome 120 * @param currLabel 121 * The label of the current outcome 122 * @param nextPrefix 123 * The prefix of the following outcome 124 * @param nextLabel 125 * The label of the following outcome 126 * @return True if the current outcome represents the end of a chunk 127 */ 128 protected abstract boolean isEndOfChunk( 129 char currPrefix, 130 String currLabel, 131 char nextPrefix, 132 String nextLabel); 133 134 @Override 135 public List<CHUNK_TYPE> createChunks( 136 JCas jCas, 137 List<SUB_CHUNK_TYPE> subChunks, 138 List<String> outcomes) throws AnalysisEngineProcessException { 139 140 // validate parameters 141 int nSubChunks = subChunks.size(); 142 int nOutcomes = outcomes.size(); 143 if (nSubChunks != nOutcomes) { 144 String message = "expected the same number of sub-chunks (%d) as outcome s(%d)"; 145 throw new IllegalArgumentException(String.format(message, nSubChunks, nOutcomes)); 146 } 147 148 // get the Feature object if we need to assign an attribute 149 Feature feature; 150 if (this.featureFullName == null) { 151 feature = null; 152 } else { 153 feature = jCas.getTypeSystem().getFeatureByFullName(this.featureFullName); 154 } 155 156 // parse outcomes, and add a final Outside outcome for ease of parsing 157 List<ChunkOutcome> chunkOutcomes = new ArrayList<ChunkOutcome>(); 158 for (String outcome : outcomes) { 159 chunkOutcomes.add(new ChunkOutcome(outcome)); 160 } 161 chunkOutcomes.add(new ChunkOutcome("O")); 162 163 // create chunk annotations as appropriate for the outcomes 164 List<CHUNK_TYPE> chunks = new ArrayList<CHUNK_TYPE>(); 165 for (int i = 0; i < outcomes.size(); ++i) { 166 ChunkOutcome outcome = chunkOutcomes.get(i); 167 168 // if we're at the beginning of a chunk, gather outcomes until we hit the end of the chunk 169 // (a chunk ends when we hit 'O' or when the label change, e.g. I-PER I-ORG) 170 if (outcome.prefix != 'O') { 171 172 // advance to the end of this chunk 173 int begin = i; 174 int end = i; 175 while (true) { 176 ChunkOutcome curr = chunkOutcomes.get(end); 177 ChunkOutcome next = chunkOutcomes.get(end + 1); 178 if (this.isEndOfChunk(curr.prefix, curr.label, next.prefix, next.label)) { 179 break; 180 } 181 ++end; 182 } 183 184 // skip over all the outcomes we just consumed 185 i = end; 186 187 // convert the outcome indexes into CAS offsets 188 begin = subChunks.get(begin).getBegin(); 189 end = subChunks.get(end).getEnd(); 190 191 // construct the chunk annotation 192 Constructor<? extends CHUNK_TYPE> constructor; 193 try { 194 constructor = this.chunkClass.getConstructor(JCas.class, int.class, int.class); 195 } catch (NoSuchMethodException e) { 196 throw new AnalysisEngineProcessException(e); 197 } 198 CHUNK_TYPE chunk; 199 try { 200 chunk = constructor.newInstance(jCas, begin, end); 201 } catch (InstantiationException e) { 202 throw new AnalysisEngineProcessException(e); 203 } catch (IllegalAccessException e) { 204 throw new AnalysisEngineProcessException(e); 205 } catch (InvocationTargetException e) { 206 throw new AnalysisEngineProcessException(e); 207 } 208 209 // set the annotation feature if necessary 210 if (this.featureFullName != null) { 211 chunk.setFeatureValueFromString(feature, outcome.label); 212 } 213 214 // add the chunk to the CAS and to the result list 215 chunk.addToIndexes(); 216 chunks.add(chunk); 217 } 218 } 219 return chunks; 220 } 221 222 private static class ChunkOutcome { 223 public char prefix; 224 225 public String label; 226 227 public ChunkOutcome(String outcome) { 228 this.prefix = outcome.charAt(0); 229 this.label = outcome.length() < 2 ? "" : outcome.substring(2); 230 } 231 } 232}