001/** 
002 * Copyright (c) 2009, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.token.pos;
025
026import java.util.ArrayList;
027import java.util.List;
028
029import org.apache.uima.UimaContext;
030import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
031import org.apache.uima.cas.FSIterator;
032import org.apache.uima.cas.Type;
033import org.apache.uima.jcas.JCas;
034import org.apache.uima.jcas.cas.TOP;
035import org.apache.uima.jcas.tcas.Annotation;
036import org.apache.uima.resource.ResourceInitializationException;
037import org.cleartk.ml.CleartkSequenceAnnotator;
038import org.cleartk.ml.Feature;
039import org.cleartk.ml.Instance;
040import org.cleartk.util.ReflectionUtil;
041import org.apache.uima.fit.descriptor.ConfigurationParameter;
042import org.apache.uima.fit.factory.initializable.InitializableFactory;
043import org.apache.uima.fit.util.JCasUtil;
044
045/**
046 * <br>
047 * Copyright (c) 2009, Regents of the University of Colorado <br>
048 * All rights reserved.
049 * 
050 * @author Philip Ogren
051 * 
052 */
053
054public abstract class PosAnnotator<TOKEN_TYPE extends Annotation, SENTENCE_TYPE extends Annotation>
055    extends CleartkSequenceAnnotator<String> {
056
057  public static final String PARAM_FEATURE_EXTRACTOR_CLASS_NAME = "featureExtractorClassName";
058
059  @ConfigurationParameter(
060      name = PARAM_FEATURE_EXTRACTOR_CLASS_NAME,
061      mandatory = true, description = "provides the full name of the class that will be used to extract features", defaultValue = "org.cleartk.token.pos.impl.DefaultFeatureExtractor")
062  private String featureExtractorClassName;
063
064  protected PosFeatureExtractor<TOKEN_TYPE, SENTENCE_TYPE> featureExtractor;
065
066  private Class<? extends TOP> tokenClass;
067
068  private Class<? extends TOP> sentenceClass;
069
070  protected boolean typesInitialized = false;
071
072  protected Type tokenType;
073
074  protected Type sentenceType;
075
076  @Override
077  public void initialize(UimaContext context) throws ResourceInitializationException {
078    super.initialize(context);
079
080    // extract the token and sentence classes from the type parameters
081    this.tokenClass = ReflectionUtil.<Class<? extends TOP>> uncheckedCast(ReflectionUtil
082        .getTypeArgument(PosAnnotator.class, "TOKEN_TYPE", this));
083    this.sentenceClass = ReflectionUtil.<Class<? extends TOP>> uncheckedCast(ReflectionUtil
084        .getTypeArgument(PosAnnotator.class, "SENTENCE_TYPE", this));
085
086    // create the feature extractor and tagger
087    PosFeatureExtractor<?, ?> untypedExtractor = InitializableFactory.create(
088        context,
089        featureExtractorClassName,
090        PosFeatureExtractor.class);
091
092    // check that the type parameters are compatible
093    ReflectionUtil.checkTypeParameterIsAssignable(
094        PosFeatureExtractor.class,
095        "TOKEN_TYPE",
096        untypedExtractor,
097        PosAnnotator.class,
098        "TOKEN_TYPE",
099        this);
100    ReflectionUtil.checkTypeParameterIsAssignable(
101        PosFeatureExtractor.class,
102        "SENTENCE_TYPE",
103        untypedExtractor,
104        PosAnnotator.class,
105        "SENTENCE_TYPE",
106        this);
107
108    // set the instance variables
109    this.featureExtractor = ReflectionUtil.uncheckedCast(untypedExtractor);
110  }
111
112  protected void initializeTypes(JCas jCas) throws AnalysisEngineProcessException {
113    try {
114      tokenType = JCasUtil.getType(jCas, this.tokenClass);
115      sentenceType = JCasUtil.getType(jCas, this.sentenceClass);
116    } catch (Exception e) {
117      throw new AnalysisEngineProcessException(e);
118    }
119    typesInitialized = true;
120  }
121
122  @Override
123  public void process(JCas jCas) throws AnalysisEngineProcessException {
124    if (!typesInitialized)
125      initializeTypes(jCas);
126
127    FSIterator<Annotation> sentences = jCas.getAnnotationIndex(sentenceType).iterator();
128    while (sentences.hasNext()) {
129      @SuppressWarnings("unchecked")
130      SENTENCE_TYPE sentence = (SENTENCE_TYPE) sentences.next();
131
132      List<Instance<String>> instances = new ArrayList<Instance<String>>();
133
134      FSIterator<Annotation> tokens = jCas.getAnnotationIndex(tokenType).subiterator(sentence);
135
136      while (tokens.hasNext()) {
137        @SuppressWarnings("unchecked")
138        TOKEN_TYPE token = (TOKEN_TYPE) tokens.next();
139        List<Feature> features = featureExtractor.extractFeatures(jCas, token, sentence);
140        Instance<String> instance = new Instance<String>();
141        instance.addAll(features);
142        instance.setOutcome(getTag(jCas, token));
143        instances.add(instance);
144      }
145
146      if (this.isTraining()) {
147        this.dataWriter.write(instances);
148      } else {
149        List<String> tags = this.classify(instances);
150        tokens.moveToFirst();
151        for (int i = 0; tokens.hasNext(); i++) {
152          @SuppressWarnings("unchecked")
153          TOKEN_TYPE token = (TOKEN_TYPE) tokens.next();
154          setTag(jCas, token, tags.get(i));
155        }
156      }
157    }
158  }
159
160  public abstract void setTag(JCas jCas, TOKEN_TYPE token, String tag);
161
162  public abstract String getTag(JCas jCas, TOKEN_TYPE token);
163
164  public void setFeatureExtractorClassName(String featureExtractorClassName) {
165    this.featureExtractorClassName = featureExtractorClassName;
166  }
167
168}