/**
 * Copyright (c) 2009-2011, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.ml;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.initializable.Initializable;
import org.apache.uima.fit.factory.initializable.InitializableFactory;
import org.apache.uima.resource.ResourceInitializationException;
import org.cleartk.ml.jar.DirectoryDataWriterFactory;
import org.cleartk.ml.jar.GenericJarClassifierFactory;
import org.cleartk.util.CleartkInitializationException;
import org.cleartk.util.ReflectionUtil;

/**
 * Base class for annotators that operate in one of two modes, chosen once during
 * {@link #initialize(UimaContext)}: in training mode a {@link SequenceDataWriter} is created and
 * exposed through {@link #dataWriter}; otherwise a {@link SequenceClassifier} is created and
 * exposed through {@link #classifier}.
 *
 * <br>
 * Copyright (c) 2009-2011, Regents of the University of Colorado <br>
 * All rights reserved.
 * <p>
 *
 * @param <OUTCOME_TYPE>
 *          the type of outcome handled by the data writer and the classifier
 */

public abstract class CleartkSequenceAnnotator<OUTCOME_TYPE> extends JCasAnnotator_ImplBase
    implements Initializable {

  public static final String PARAM_CLASSIFIER_FACTORY_CLASS_NAME = "classifierFactoryClassName";

  private static final String DEFAULT_CLASSIFIER_FACTORY_CLASS_NAME = "org.cleartk.ml.jar.SequenceJarClassifierFactory";

  // The default refers to the constant (rather than repeating the literal) so that it cannot
  // drift away from the value that the mode inference in initialize() compares against.
  @ConfigurationParameter(
      name = PARAM_CLASSIFIER_FACTORY_CLASS_NAME,
      mandatory = false,
      description = "provides the full name of the SequenceClassifierFactory class to be used.",
      defaultValue = DEFAULT_CLASSIFIER_FACTORY_CLASS_NAME)
  private String classifierFactoryClassName;

  public static final String PARAM_DATA_WRITER_FACTORY_CLASS_NAME = "dataWriterFactoryClassName";

  private static final String DEFAULT_DATA_WRITER_FACTORY_CLASS_NAME = "org.cleartk.ml.jar.DefaultSequenceDataWriterFactory";

  @ConfigurationParameter(
      name = PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
      mandatory = false,
      description = "provides the full name of the SequenceDataWriterFactory class to be used.",
      defaultValue = DEFAULT_DATA_WRITER_FACTORY_CLASS_NAME)
  private String dataWriterFactoryClassName;

  public static final String PARAM_IS_TRAINING = "isTraining";

  // Boxed on purpose: null means "not configured", in which case the mode is inferred.
  @ConfigurationParameter(
      name = PARAM_IS_TRAINING,
      mandatory = false,
      description = "determines whether this annotator is writing training data or using a classifier to annotate. Normally inferred automatically based on whether or not a DataWriterFactory class has been set.")
  private Boolean isTraining;

  /** The resolved mode, computed once in {@link #initialize(UimaContext)}. */
  private boolean primitiveIsTraining;

  /** Assigned by {@link #initialize(UimaContext)} only when {@link #isTraining()} is true. */
  protected SequenceDataWriter<OUTCOME_TYPE> dataWriter;

  /** Assigned by {@link #initialize(UimaContext)} only when {@link #isTraining()} is false. */
  protected SequenceClassifier<OUTCOME_TYPE> classifier;

  /**
   * Resolves whether this annotator is training or predicting and creates the matching data
   * writer or classifier.
   *
   * @param context
   *          the UIMA context used for parameter lookup and for creating/initializing the
   *          factory products
   * @throws ResourceInitializationException
   *           if neither factory class name is set, or if creating the data writer or classifier
   *           fails with an {@link IOException}
   * @throws IllegalArgumentException
   *           if {@link #PARAM_IS_TRAINING} is not set and the mode cannot be inferred
   */
  @Override
  public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);

    if (dataWriterFactoryClassName == null && classifierFactoryClassName == null) {
      // neitherParameterSet(...) is used as an exception factory; its result was previously
      // discarded, which turned this guard into a no-op. Throw it so the misconfiguration is
      // reported here instead of failing later with a less helpful error.
      throw CleartkInitializationException.neitherParameterSet(
          PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
          dataWriterFactoryClassName,
          PARAM_CLASSIFIER_FACTORY_CLASS_NAME,
          classifierFactoryClassName);
    }

    // determine whether we start out as training or predicting
    this.primitiveIsTraining = this.resolveIsTraining(context);

    if (this.isTraining()) {
      this.initializeDataWriter(context);
    } else {
      this.initializeClassifier(context);
    }
  }

  /**
   * Decides between training and predicting. The checks are ordered: an explicit
   * {@link #PARAM_IS_TRAINING} value wins; then a non-default data writer factory or a configured
   * output directory selects training; then a non-default classifier factory or a configured
   * classifier jar path selects predicting.
   */
  private boolean resolveIsTraining(UimaContext context) {
    if (this.isTraining != null) {
      return this.isTraining;
    }
    if (!DEFAULT_DATA_WRITER_FACTORY_CLASS_NAME.equals(this.dataWriterFactoryClassName)) {
      return true;
    }
    if (context.getConfigParameterValue(DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY) != null) {
      return true;
    }
    if (!DEFAULT_CLASSIFIER_FACTORY_CLASS_NAME.equals(this.classifierFactoryClassName)) {
      return false;
    }
    if (context.getConfigParameterValue(GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH) != null) {
      return false;
    }
    String message = "Please specify PARAM_IS_TRAINING - unable to infer it from context";
    throw new IllegalArgumentException(message);
  }

  /** Creates the data writer factory, builds the data writer, initializes and stores it. */
  private void initializeDataWriter(UimaContext context) throws ResourceInitializationException {
    // create the factory and instantiate the data writer
    SequenceDataWriterFactory<?> factory = InitializableFactory.create(
        context,
        dataWriterFactoryClassName,
        SequenceDataWriterFactory.class);
    SequenceDataWriter<?> untypedDataWriter;
    try {
      untypedDataWriter = factory.createDataWriter();
    } catch (IOException e) {
      throw new ResourceInitializationException(e);
    }

    InitializableFactory.initialize(untypedDataWriter, context);
    this.dataWriter = ReflectionUtil.uncheckedCast(untypedDataWriter);
  }

  /**
   * Creates the classifier factory, builds and stores the classifier, checks that its outcome
   * type parameter is assignable to this annotator's, and then initializes it.
   */
  private void initializeClassifier(UimaContext context) throws ResourceInitializationException {
    // create the factory and instantiate the classifier
    SequenceClassifierFactory<?> factory = InitializableFactory.create(
        context,
        classifierFactoryClassName,
        SequenceClassifierFactory.class);
    SequenceClassifier<?> untypedClassifier;
    try {
      untypedClassifier = factory.createClassifier();
    } catch (IOException e) {
      throw new ResourceInitializationException(e);
    }

    this.classifier = ReflectionUtil.uncheckedCast(untypedClassifier);
    ReflectionUtil.checkTypeParameterIsAssignable(
        CleartkSequenceAnnotator.class,
        "OUTCOME_TYPE",
        this,
        SequenceClassifier.class,
        "OUTCOME_TYPE",
        this.classifier);
    InitializableFactory.initialize(untypedClassifier, context);
  }

  /**
   * Calls {@code finish()} on the data writer when this annotator is in training mode.
   *
   * @throws AnalysisEngineProcessException
   *           propagated from the superclass or from the data writer
   */
  @Override
  public void collectionProcessComplete() throws AnalysisEngineProcessException {
    super.collectionProcessComplete();
    if (isTraining()) {
      dataWriter.finish();
    }
  }

  /**
   * @return true if this annotator resolved to training mode during
   *         {@link #initialize(UimaContext)}, false if it resolved to predicting
   */
  protected boolean isTraining() {
    return this.primitiveIsTraining;
  }

  /**
   * Extracts the feature list of every instance, in order, and passes them to
   * {@link #classifier}. The classifier is only created when not training, so this must not be
   * called in training mode.
   *
   * @param instances
   *          the sequence of instances to classify
   * @return the result of the classifier for the sequence of feature lists
   * @throws CleartkProcessingException
   *           propagated from the classifier
   */
  protected List<OUTCOME_TYPE> classify(List<Instance<OUTCOME_TYPE>> instances)
      throws CleartkProcessingException {
    // presized: exactly one feature list is added per instance
    List<List<Feature>> instanceFeatures = new ArrayList<List<Feature>>(instances.size());
    for (Instance<OUTCOME_TYPE> instance : instances) {
      instanceFeatures.add(instance.getFeatures());
    }
    return this.classifier.classify(instanceFeatures);
  }

}