001/** 
002 * Copyright (c) 2011, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.ml.multi;
025
026import java.io.File;
027import java.io.IOException;
028import java.util.HashMap;
029import java.util.Map;
030
031import org.apache.uima.UimaContext;
032import org.apache.uima.UimaContextAdmin;
033import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
034import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
035import org.apache.uima.fit.descriptor.ConfigurationParameter;
036import org.apache.uima.fit.factory.initializable.Initializable;
037import org.apache.uima.fit.factory.initializable.InitializableFactory;
038import org.apache.uima.resource.ConfigurationManager;
039import org.apache.uima.resource.ResourceInitializationException;
040import org.cleartk.ml.Classifier;
041import org.cleartk.ml.ClassifierFactory;
042import org.cleartk.ml.CleartkAnnotator;
043import org.cleartk.ml.CleartkProcessingException;
044import org.cleartk.ml.DataWriter;
045import org.cleartk.ml.DataWriterFactory;
046import org.cleartk.ml.jar.DirectoryDataWriterFactory;
047import org.cleartk.ml.jar.GenericJarClassifierFactory;
048import org.cleartk.util.CleartkInitializationException;
049import org.cleartk.util.ReflectionUtil;
050
051/**
052 * <br>
053 * Copyright (c) 2011, Regents of the University of Colorado <br>
054 * All rights reserved.
055 * 
056 * @author Lee Becker
057 * 
058 *         This class provides a framework for handling multiple classifiers within a single
059 *         analysis engine.
060 *         <p>
061 * 
062 *         Use cases that may lend themselves to {@link CleartkMultiAnnotator} over
063 *         {@link CleartkAnnotator} include:
064 *         <ul>
065 *         <li>Explicit control for one-vs-all multi-label classification
066 *         <li>Predicating training/classification on specific conditions. (e.g. if the the word to
067 *         the left is a verb, use the verb classifier, else use the default classifier)
068 *         <li>Voting or ensemble classification
069 *         </ul>
070 */
071public abstract class CleartkMultiAnnotator<OUTCOME_TYPE> extends JCasAnnotator_ImplBase implements
072    Initializable {
073
074  public static final String PARAM_CLASSIFIER_FACTORY_CLASS_NAME = "classifierFactoryClassName";
075
076  private static final String DEFAULT_CLASSIFIER_FACTORY_CLASS_NAME = "org.cleartk.ml.jar.JarClassifierFactory";
077
078  @ConfigurationParameter(
079      name = PARAM_CLASSIFIER_FACTORY_CLASS_NAME,
080      mandatory = false,
081      description = "provides the full name of the ClassifierFactory class to be used.",
082      defaultValue = DEFAULT_CLASSIFIER_FACTORY_CLASS_NAME)
083  private String classifierFactoryClassName;
084
085  public static final String PARAM_DATA_WRITER_FACTORY_CLASS_NAME = "dataWriterFactoryClassName";
086
087  private static final String DEFAULT_DATA_WRITER_FACTORY_CLASS_NAME = "org.cleartk.ml.jar.DefaultDataWriterFactory";
088
089  @ConfigurationParameter(
090      name = PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
091      mandatory = false,
092      description = "provides the full name of the DataWriterFactory class to be used.",
093      defaultValue = DEFAULT_DATA_WRITER_FACTORY_CLASS_NAME)
094  private String dataWriterFactoryClassName;
095
096  public static final String PARAM_IS_TRAINING = "isTraining";
097
098  @ConfigurationParameter(
099      name = PARAM_IS_TRAINING,
100      mandatory = false,
101      description = "determines whether this annotator is writing training data or using a classifier to annotate. Normally inferred automatically based on whether or not a DataWriterFactory class has been set.")
102  private Boolean isTraining;
103
104  private boolean primitiveIsTraining;
105
106  protected ClassifierFactory<?> classifierFactory;
107
108  protected DataWriterFactory<?> dataWriterFactory;
109
110  protected Map<String, Classifier<OUTCOME_TYPE>> classifiers;
111
112  protected Map<String, DataWriter<OUTCOME_TYPE>> dataWriters;
113
114  private UimaContext uimaContext;
115
116  protected File outputDirectoryRoot;
117
118  protected File classifierJarPathRoot;
119
120  @Override
121  public void initialize(UimaContext context) throws ResourceInitializationException {
122    super.initialize(context);
123
124    if (this.dataWriterFactoryClassName == null && this.classifierFactoryClassName == null) {
125      CleartkInitializationException.neitherParameterSet(
126          PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
127          this.dataWriterFactoryClassName,
128          PARAM_CLASSIFIER_FACTORY_CLASS_NAME,
129          this.classifierFactoryClassName);
130    }
131
132    // determine whether we start out as training or predicting
133    if (this.isTraining != null) {
134      this.primitiveIsTraining = this.isTraining;
135    } else if (!DEFAULT_DATA_WRITER_FACTORY_CLASS_NAME.equals(this.dataWriterFactoryClassName)) {
136      this.primitiveIsTraining = true;
137    } else if (context.getConfigParameterValue(DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY) != null) {
138      this.primitiveIsTraining = true;
139    } else if (!DEFAULT_CLASSIFIER_FACTORY_CLASS_NAME.equals(this.classifierFactoryClassName)) {
140      this.primitiveIsTraining = false;
141    } else if (context.getConfigParameterValue(GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH) != null) {
142      this.primitiveIsTraining = false;
143    } else {
144      String message = "Please specify PARAM_IS_TRAINING - unable to infer it from context";
145      throw new IllegalArgumentException(message);
146    }
147
148    this.uimaContext = context;
149    UimaContextAdmin contextAdmin = (UimaContextAdmin) this.uimaContext;
150    ConfigurationManager manager = contextAdmin.getConfigurationManager();
151
152    if (this.isTraining()) {
153      // Create the data writer factory and initialize a Map to hold the data writers
154      // Individual data writers will be created dynamically with the getDataWriter method
155      this.dataWriters = new HashMap<String, DataWriter<OUTCOME_TYPE>>();
156      this.outputDirectoryRoot = (File) manager.getConfigParameterValue(DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY);
157      this.dataWriterFactory = InitializableFactory.create(
158          context,
159          dataWriterFactoryClassName,
160          DataWriterFactory.class);
161    } else {
162
163      // Create the classifier factory and initialize a map to hold the classifiers
164      // Individual classifiers will be created dynamically with the getClassifier method
165      this.classifiers = new HashMap<String, Classifier<OUTCOME_TYPE>>();
166      this.classifierJarPathRoot = (File) manager.getConfigParameterValue(GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH);
167
168      this.classifierFactory = InitializableFactory.create(
169          context,
170          classifierFactoryClassName,
171          ClassifierFactory.class);
172    }
173  }
174
175  @Override
176  public void collectionProcessComplete() throws AnalysisEngineProcessException {
177    super.collectionProcessComplete();
178    if (this.isTraining()) {
179      try {
180        for (DataWriter<OUTCOME_TYPE> dataWriter : dataWriters.values()) {
181          dataWriter.finish();
182        }
183      } catch (CleartkProcessingException ctke) {
184        throw new AnalysisEngineProcessException(ctke);
185      }
186    }
187  }
188
189  protected boolean isTraining() {
190    return this.primitiveIsTraining;
191  }
192
193  /**
194   * Gets the classifier associated with name. If it does not exist, this method will use the
195   * {@link ClassifierFactory} specified at initialization to create a new one.
196   * 
197   * @param name
198   *          The name of the classifier
199   * @return The classifier associated with the name
200   */
201  protected Classifier<OUTCOME_TYPE> getClassifier(String name)
202      throws ResourceInitializationException {
203    if (classifiers.containsKey(name)) {
204      return classifiers.get(name);
205    }
206
207    File classifierJarPath = new File(this.classifierJarPathRoot, name);
208    UimaContextAdmin contextAdmin = (UimaContextAdmin) this.uimaContext;
209    ConfigurationManager manager = contextAdmin.getConfigurationManager();
210    manager.setConfigParameterValue(contextAdmin.getQualifiedContextName()
211        + GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, classifierJarPath.getPath());
212
213    // create the factory and instantiate the classifier
214    ClassifierFactory<?> factory = InitializableFactory.create(
215        uimaContext,
216        classifierFactoryClassName,
217        ClassifierFactory.class);
218    Classifier<?> untypedClassifier;
219    try {
220      untypedClassifier = factory.createClassifier();
221    } catch (IOException e) {
222      throw new ResourceInitializationException(e);
223    }
224
225    Classifier<OUTCOME_TYPE> classifier = ReflectionUtil.uncheckedCast(untypedClassifier);
226    ReflectionUtil.checkTypeParameterIsAssignable(
227        CleartkMultiAnnotator.class,
228        "OUTCOME_TYPE",
229        this,
230        Classifier.class,
231        "OUTCOME_TYPE",
232        classifier);
233    InitializableFactory.initialize(untypedClassifier, this.getContext());
234    this.classifiers.put(name, classifier);
235    return classifier;
236  }
237
238  /**
239   * Gets the {@link DataWriter} associated with name. If it does not exist, this method will use
240   * the {@link DataWriterFactory} specified during initialization to create a dataWriter associated
241   * with the name parameter.
242   * 
243   * @param name
244   *          The name of the {@link DataWriter}
245   * @return The {@link DataWriter} associated with the name
246   */
247  protected DataWriter<OUTCOME_TYPE> getDataWriter(String name)
248      throws ResourceInitializationException {
249    if (dataWriters.containsKey(name)) {
250      return dataWriters.get(name);
251    }
252
253    DataWriter<?> untypedDataWriter;
254    File dataWriterPath = new File(this.outputDirectoryRoot, name);
255    UimaContextAdmin contextAdmin = (UimaContextAdmin) this.uimaContext;
256    ConfigurationManager manager = contextAdmin.getConfigurationManager();
257    manager.setConfigParameterValue(contextAdmin.getQualifiedContextName()
258        + DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY, dataWriterPath);
259
260    try {
261      untypedDataWriter = this.dataWriterFactory.createDataWriter();
262    } catch (IOException e) {
263      throw new ResourceInitializationException(e);
264    }
265
266    InitializableFactory.initialize(untypedDataWriter, uimaContext);
267    DataWriter<OUTCOME_TYPE> dataWriter = ReflectionUtil.uncheckedCast(untypedDataWriter);
268    this.dataWriters.put(name, dataWriter);
269    return dataWriter;
270  }
271
272}