001/** 002 * Copyright (c) 2009, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.token.pos; 025 026import java.util.ArrayList; 027import java.util.List; 028 029import org.apache.uima.UimaContext; 030import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 031import org.apache.uima.cas.FSIterator; 032import org.apache.uima.cas.Type; 033import org.apache.uima.jcas.JCas; 034import org.apache.uima.jcas.cas.TOP; 035import org.apache.uima.jcas.tcas.Annotation; 036import org.apache.uima.resource.ResourceInitializationException; 037import org.cleartk.ml.CleartkSequenceAnnotator; 038import org.cleartk.ml.Feature; 039import org.cleartk.ml.Instance; 040import org.cleartk.util.ReflectionUtil; 041import org.apache.uima.fit.descriptor.ConfigurationParameter; 042import org.apache.uima.fit.factory.initializable.InitializableFactory; 043import org.apache.uima.fit.util.JCasUtil; 044 045/** 046 * <br> 047 * Copyright (c) 2009, Regents of the University of Colorado <br> 048 * All rights reserved. 049 * 050 * @author Philip Ogren 051 * 052 */ 053 054public abstract class PosAnnotator<TOKEN_TYPE extends Annotation, SENTENCE_TYPE extends Annotation> 055 extends CleartkSequenceAnnotator<String> { 056 057 public static final String PARAM_FEATURE_EXTRACTOR_CLASS_NAME = "featureExtractorClassName"; 058 059 @ConfigurationParameter( 060 name = PARAM_FEATURE_EXTRACTOR_CLASS_NAME, 061 mandatory = true, description = "provides the full name of the class that will be used to extract features", defaultValue = "org.cleartk.token.pos.impl.DefaultFeatureExtractor") 062 private String featureExtractorClassName; 063 064 protected PosFeatureExtractor<TOKEN_TYPE, SENTENCE_TYPE> featureExtractor; 065 066 private Class<? extends TOP> tokenClass; 067 068 private Class<? extends TOP> sentenceClass; 069 070 protected boolean typesInitialized = false; 071 072 protected Type tokenType; 073 074 protected Type sentenceType; 075 076 @Override 077 public void initialize(UimaContext context) throws ResourceInitializationException { 078 super.initialize(context); 079 080 // extract the token and sentence classes from the type parameters 081 this.tokenClass = ReflectionUtil.<Class<? extends TOP>> uncheckedCast(ReflectionUtil 082 .getTypeArgument(PosAnnotator.class, "TOKEN_TYPE", this)); 083 this.sentenceClass = ReflectionUtil.<Class<? extends TOP>> uncheckedCast(ReflectionUtil 084 .getTypeArgument(PosAnnotator.class, "SENTENCE_TYPE", this)); 085 086 // create the feature extractor and tagger 087 PosFeatureExtractor<?, ?> untypedExtractor = InitializableFactory.create( 088 context, 089 featureExtractorClassName, 090 PosFeatureExtractor.class); 091 092 // check that the type parameters are compatible 093 ReflectionUtil.checkTypeParameterIsAssignable( 094 PosFeatureExtractor.class, 095 "TOKEN_TYPE", 096 untypedExtractor, 097 PosAnnotator.class, 098 "TOKEN_TYPE", 099 this); 100 ReflectionUtil.checkTypeParameterIsAssignable( 101 PosFeatureExtractor.class, 102 "SENTENCE_TYPE", 103 untypedExtractor, 104 PosAnnotator.class, 105 "SENTENCE_TYPE", 106 this); 107 108 // set the instance variables 109 this.featureExtractor = ReflectionUtil.uncheckedCast(untypedExtractor); 110 } 111 112 protected void initializeTypes(JCas jCas) throws AnalysisEngineProcessException { 113 try { 114 tokenType = JCasUtil.getType(jCas, this.tokenClass); 115 sentenceType = JCasUtil.getType(jCas, this.sentenceClass); 116 } catch (Exception e) { 117 throw new AnalysisEngineProcessException(e); 118 } 119 typesInitialized = true; 120 } 121 122 @Override 123 public void process(JCas jCas) throws AnalysisEngineProcessException { 124 if (!typesInitialized) 125 initializeTypes(jCas); 126 127 FSIterator<Annotation> sentences = jCas.getAnnotationIndex(sentenceType).iterator(); 128 while (sentences.hasNext()) { 129 @SuppressWarnings("unchecked") 130 SENTENCE_TYPE sentence = (SENTENCE_TYPE) sentences.next(); 131 132 List<Instance<String>> instances = new ArrayList<Instance<String>>(); 133 134 FSIterator<Annotation> tokens = jCas.getAnnotationIndex(tokenType).subiterator(sentence); 135 136 while (tokens.hasNext()) { 137 @SuppressWarnings("unchecked") 138 TOKEN_TYPE token = (TOKEN_TYPE) tokens.next(); 139 List<Feature> features = featureExtractor.extractFeatures(jCas, token, sentence); 140 Instance<String> instance = new Instance<String>(); 141 instance.addAll(features); 142 instance.setOutcome(getTag(jCas, token)); 143 instances.add(instance); 144 } 145 146 if (this.isTraining()) { 147 this.dataWriter.write(instances); 148 } else { 149 List<String> tags = this.classify(instances); 150 tokens.moveToFirst(); 151 for (int i = 0; tokens.hasNext(); i++) { 152 @SuppressWarnings("unchecked") 153 TOKEN_TYPE token = (TOKEN_TYPE) tokens.next(); 154 setTag(jCas, token, tags.get(i)); 155 } 156 } 157 } 158 } 159 160 public abstract void setTag(JCas jCas, TOKEN_TYPE token, String tag); 161 162 public abstract String getTag(JCas jCas, TOKEN_TYPE token); 163 164 public void setFeatureExtractorClassName(String featureExtractorClassName) { 165 this.featureExtractorClassName = featureExtractorClassName; 166 } 167 168}