/**
 * Copyright (c) 2011, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.ml.multi;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.uima.UimaContext;
import org.apache.uima.UimaContextAdmin;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.initializable.Initializable;
import org.apache.uima.fit.factory.initializable.InitializableFactory;
import org.apache.uima.resource.ConfigurationManager;
import org.apache.uima.resource.ResourceInitializationException;
import org.cleartk.ml.Classifier;
import org.cleartk.ml.ClassifierFactory;
import org.cleartk.ml.CleartkAnnotator;
import org.cleartk.ml.CleartkProcessingException;
import org.cleartk.ml.DataWriter;
import org.cleartk.ml.DataWriterFactory;
import org.cleartk.ml.jar.DirectoryDataWriterFactory;
import org.cleartk.ml.jar.GenericJarClassifierFactory;
import org.cleartk.util.CleartkInitializationException;
import org.cleartk.util.ReflectionUtil;

/**
 * This class provides a framework for handling multiple classifiers within a single analysis
 * engine. Classifiers (when predicting) and data writers (when training) are created lazily, one
 * per name, and cached for the lifetime of the annotator.
 * <p>
 * Use cases that may lend themselves to {@link CleartkMultiAnnotator} over
 * {@link CleartkAnnotator} include:
 * <ul>
 * <li>Explicit control for one-vs-all multi-label classification
 * <li>Predicating training/classification on specific conditions. (e.g. if the word to the left
 * is a verb, use the verb classifier, else use the default classifier)
 * <li>Voting or ensemble classification
 * </ul>
 *
 * <br>
 * Copyright (c) 2011, Regents of the University of Colorado <br>
 * All rights reserved.
 *
 * @param <OUTCOME_TYPE>
 *          the outcome type shared by every classifier and data writer managed by this annotator
 * @author Lee Becker
 */
public abstract class CleartkMultiAnnotator<OUTCOME_TYPE> extends JCasAnnotator_ImplBase implements
    Initializable {

  public static final String PARAM_CLASSIFIER_FACTORY_CLASS_NAME = "classifierFactoryClassName";

  private static final String DEFAULT_CLASSIFIER_FACTORY_CLASS_NAME = "org.cleartk.ml.jar.JarClassifierFactory";

  @ConfigurationParameter(
      name = PARAM_CLASSIFIER_FACTORY_CLASS_NAME,
      mandatory = false,
      description = "provides the full name of the ClassifierFactory class to be used.",
      defaultValue = DEFAULT_CLASSIFIER_FACTORY_CLASS_NAME)
  private String classifierFactoryClassName;

  public static final String PARAM_DATA_WRITER_FACTORY_CLASS_NAME = "dataWriterFactoryClassName";

  private static final String DEFAULT_DATA_WRITER_FACTORY_CLASS_NAME = "org.cleartk.ml.jar.DefaultDataWriterFactory";

  @ConfigurationParameter(
      name = PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
      mandatory = false,
      description = "provides the full name of the DataWriterFactory class to be used.",
      defaultValue = DEFAULT_DATA_WRITER_FACTORY_CLASS_NAME)
  private String dataWriterFactoryClassName;

  public static final String PARAM_IS_TRAINING = "isTraining";

  @ConfigurationParameter(
      name = PARAM_IS_TRAINING,
      mandatory = false,
      description = "determines whether this annotator is writing training data or using a classifier to annotate. "
          + "Normally inferred automatically based on whether or not a DataWriterFactory class has been set.")
  private Boolean isTraining;

  /** Resolved training flag; set once in {@link #initialize(UimaContext)}. */
  private boolean primitiveIsTraining;

  /** Factory created in {@link #initialize(UimaContext)} when predicting; otherwise unset. */
  protected ClassifierFactory<?> classifierFactory;

  /** Factory created in {@link #initialize(UimaContext)} when training; otherwise unset. */
  protected DataWriterFactory<?> dataWriterFactory;

  /** Cache of classifiers keyed by name; only created when predicting. */
  protected Map<String, Classifier<OUTCOME_TYPE>> classifiers;

  /** Cache of data writers keyed by name; only created when training. */
  protected Map<String, DataWriter<OUTCOME_TYPE>> dataWriters;

  private UimaContext uimaContext;

  /** Parent directory under which each named data writer gets its own sub-path. */
  protected File outputDirectoryRoot;

  /** Parent path under which each named classifier jar path is resolved. */
  protected File classifierJarPathRoot;

  /**
   * Decides whether this annotator trains or predicts, then creates the matching factory and an
   * empty cache for the per-name data writers or classifiers.
   * <p>
   * The mode is taken from {@link #PARAM_IS_TRAINING} when set. Otherwise it is inferred, in
   * order, from: a non-default data writer factory (training), a configured output directory
   * (training), a non-default classifier factory (predicting), a configured classifier jar path
   * (predicting).
   *
   * @param context
   *          the UIMA context supplying the configuration parameters
   * @throws ResourceInitializationException
   *           if neither factory class name is set, or if a factory cannot be created
   * @throws IllegalArgumentException
   *           if the training mode is not specified and cannot be inferred
   */
  @Override
  public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);

    if (this.dataWriterFactoryClassName == null && this.classifierFactoryClassName == null) {
      // The exception built here was previously discarded, which made this guard a no-op.
      throw CleartkInitializationException.neitherParameterSet(
          PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
          this.dataWriterFactoryClassName,
          PARAM_CLASSIFIER_FACTORY_CLASS_NAME,
          this.classifierFactoryClassName);
    }

    // determine whether we start out as training or predicting
    if (this.isTraining != null) {
      this.primitiveIsTraining = this.isTraining;
    } else if (!DEFAULT_DATA_WRITER_FACTORY_CLASS_NAME.equals(this.dataWriterFactoryClassName)) {
      this.primitiveIsTraining = true;
    } else if (context.getConfigParameterValue(DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY) != null) {
      this.primitiveIsTraining = true;
    } else if (!DEFAULT_CLASSIFIER_FACTORY_CLASS_NAME.equals(this.classifierFactoryClassName)) {
      this.primitiveIsTraining = false;
    } else if (context.getConfigParameterValue(GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH) != null) {
      this.primitiveIsTraining = false;
    } else {
      String message = "Please specify PARAM_IS_TRAINING - unable to infer it from context";
      throw new IllegalArgumentException(message);
    }

    this.uimaContext = context;
    UimaContextAdmin contextAdmin = (UimaContextAdmin) this.uimaContext;
    ConfigurationManager manager = contextAdmin.getConfigurationManager();

    if (this.isTraining()) {
      // Create the data writer factory and initialize a Map to hold the data writers
      // Individual data writers will be created dynamically with the getDataWriter method
      this.dataWriters = new HashMap<String, DataWriter<OUTCOME_TYPE>>();
      // NOTE(review): this lookup uses the bare parameter name, whereas the setters in
      // getDataWriter/getClassifier prefix it with the qualified context name - confirm both
      // forms resolve to the same parameter.
      this.outputDirectoryRoot = (File) manager.getConfigParameterValue(DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY);
      this.dataWriterFactory = InitializableFactory.create(
          context,
          dataWriterFactoryClassName,
          DataWriterFactory.class);
    } else {
      // Create the classifier factory and initialize a map to hold the classifiers
      // Individual classifiers will be created dynamically with the getClassifier method
      this.classifiers = new HashMap<String, Classifier<OUTCOME_TYPE>>();
      this.classifierJarPathRoot = (File) manager.getConfigParameterValue(GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH);

      this.classifierFactory = InitializableFactory.create(
          context,
          classifierFactoryClassName,
          ClassifierFactory.class);
    }
  }

  /**
   * When training, calls {@link DataWriter#finish()} on every data writer created so far.
   *
   * @throws AnalysisEngineProcessException
   *           wrapping any {@link CleartkProcessingException} raised while finishing a writer
   */
  @Override
  public void collectionProcessComplete() throws AnalysisEngineProcessException {
    super.collectionProcessComplete();
    if (this.isTraining()) {
      try {
        for (DataWriter<OUTCOME_TYPE> dataWriter : dataWriters.values()) {
          dataWriter.finish();
        }
      } catch (CleartkProcessingException ctke) {
        throw new AnalysisEngineProcessException(ctke);
      }
    }
  }

  /**
   * @return true if this annotator is writing training data, false if it is classifying
   */
  protected boolean isTraining() {
    return this.primitiveIsTraining;
  }

  /**
   * Gets the classifier associated with name. If it does not exist, this method will use the
   * {@link ClassifierFactory} class specified at initialization to create a new one and cache it.
   *
   * @param name
   *          The name of the classifier
   * @return The classifier associated with the name
   * @throws ResourceInitializationException
   *           if the factory or the classifier cannot be created or initialized
   * @throws IllegalStateException
   *           if no classifier cache exists, i.e. the annotator is in training mode or has not
   *           been initialized
   */
  protected Classifier<OUTCOME_TYPE> getClassifier(String name)
      throws ResourceInitializationException {
    if (this.classifiers == null) {
      throw new IllegalStateException("Cannot get classifier \"" + name
          + "\": classifiers are unavailable because the annotator is in training mode"
          + " or has not been initialized");
    }

    Classifier<OUTCOME_TYPE> cached = this.classifiers.get(name);
    if (cached != null) {
      return cached;
    }

    // Point the classifier jar path parameter at <root>/<name> before building the factory.
    File classifierJarPath = new File(this.classifierJarPathRoot, name);
    UimaContextAdmin contextAdmin = (UimaContextAdmin) this.uimaContext;
    ConfigurationManager manager = contextAdmin.getConfigurationManager();
    manager.setConfigParameterValue(contextAdmin.getQualifiedContextName()
        + GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, classifierJarPath.getPath());

    // create the factory and instantiate the classifier; a fresh factory is built per name
    // (rather than reusing this.classifierFactory), presumably so that it picks up the
    // parameter value set just above
    ClassifierFactory<?> factory = InitializableFactory.create(
        uimaContext,
        classifierFactoryClassName,
        ClassifierFactory.class);
    Classifier<?> untypedClassifier;
    try {
      untypedClassifier = factory.createClassifier();
    } catch (IOException e) {
      throw new ResourceInitializationException(e);
    }

    Classifier<OUTCOME_TYPE> classifier = ReflectionUtil.uncheckedCast(untypedClassifier);
    ReflectionUtil.checkTypeParameterIsAssignable(
        CleartkMultiAnnotator.class,
        "OUTCOME_TYPE",
        this,
        Classifier.class,
        "OUTCOME_TYPE",
        classifier);
    InitializableFactory.initialize(untypedClassifier, this.getContext());
    this.classifiers.put(name, classifier);
    return classifier;
  }

  /**
   * Gets the {@link DataWriter} associated with name. If it does not exist, this method will use
   * the {@link DataWriterFactory} specified during initialization to create a dataWriter associated
   * with the name parameter and cache it.
   *
   * @param name
   *          The name of the {@link DataWriter}
   * @return The {@link DataWriter} associated with the name
   * @throws ResourceInitializationException
   *           if the data writer cannot be created or initialized
   * @throws IllegalStateException
   *           if no data writer cache exists, i.e. the annotator is not in training mode or has
   *           not been initialized
   */
  protected DataWriter<OUTCOME_TYPE> getDataWriter(String name)
      throws ResourceInitializationException {
    if (this.dataWriters == null) {
      throw new IllegalStateException("Cannot get data writer \"" + name
          + "\": data writers are unavailable because the annotator is not in training mode"
          + " or has not been initialized");
    }

    DataWriter<OUTCOME_TYPE> cached = this.dataWriters.get(name);
    if (cached != null) {
      return cached;
    }

    // Point the output directory parameter at <root>/<name> before creating the writer.
    DataWriter<?> untypedDataWriter;
    File dataWriterPath = new File(this.outputDirectoryRoot, name);
    UimaContextAdmin contextAdmin = (UimaContextAdmin) this.uimaContext;
    ConfigurationManager manager = contextAdmin.getConfigurationManager();
    manager.setConfigParameterValue(contextAdmin.getQualifiedContextName()
        + DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY, dataWriterPath);

    // NOTE(review): unlike getClassifier, this reuses the factory built in initialize(), i.e.
    // before the parameter above was overridden - confirm the factory honors the new value.
    try {
      untypedDataWriter = this.dataWriterFactory.createDataWriter();
    } catch (IOException e) {
      throw new ResourceInitializationException(e);
    }

    InitializableFactory.initialize(untypedDataWriter, uimaContext);
    DataWriter<OUTCOME_TYPE> dataWriter = ReflectionUtil.uncheckedCast(untypedDataWriter);
    this.dataWriters.put(name, dataWriter);
    return dataWriter;
  }

}