/*
 * Copyright (c) 2012, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.clearnlp;

import java.util.List;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;

import com.google.common.annotations.Beta;
import com.clearnlp.component.AbstractComponent;
import com.clearnlp.dependency.DEPNode;
import com.clearnlp.dependency.DEPTree;
import com.clearnlp.nlp.NLPGetter;
import com.clearnlp.nlp.NLPLib;
import com.clearnlp.reader.AbstractReader;

/**
 * <br>
 * Copyright (c) 2012, Regents of the University of Colorado <br>
 * All rights reserved.
 * <p>
 * This class provides a UIMA/ClearTK wrapper for the ClearNLP part of speech (POS) tagger. This
 * engine requires tokenized input and writes a POS tag and a lemma onto each token.
 * <p>
 * Everything that depends on the target type system (selecting the tokens inside a window and
 * storing the POS tag and lemma on a token) is delegated to the {@link TokenOps} instance that the
 * subclass passes to the constructor.
 * <p>
 * This tagger is available here:
 * <p>
 * http://clearnlp.googlecode.com
 * <p>
 *
 * @param <TOKEN_TYPE>
 *          the annotation type used for tokens in the target type system
 *
 * @author Lee Becker
 *
 */
@Beta
public abstract class PosTagger_ImplBase<TOKEN_TYPE extends Annotation> extends
    JCasAnnotator_ImplBase {

  public static final String DEFAULT_MODEL_PATH = "general-en";

  public static final String PARAM_MODEL_PATH = "modelPath";

  @ConfigurationParameter(
      name = PARAM_MODEL_PATH,
      mandatory = false,
      description = "This parameter provides the path to the pos tagger model.",
      defaultValue = DEFAULT_MODEL_PATH)
  private String modelPath;

  /*
  public static final String PARAM_MODEL_URI = "modelUri";

  @ConfigurationParameter(
      name = PARAM_MODEL_URI,
      mandatory = false,
      description = "This parameter provides the URI to the pos tagger model.")
  private URI modelUri;
  */

  public static final String PARAM_LANGUAGE_CODE = "languageCode";

  @ConfigurationParameter(
      name = PARAM_LANGUAGE_CODE,
      mandatory = false,
      description = "Language code for the pos tagger (default value=en).",
      defaultValue = AbstractReader.LANG_EN)
  private String languageCode;

  public static final String PARAM_WINDOW_CLASS = "windowClass";

  // The trailing space after "with" keeps the concatenated sentence readable
  // ("... precede tokenization with sentence segmentation ...").
  private static final String WINDOW_TYPE_DESCRIPTION = "specifies the class type of annotations that will be tokenized. "
      + "By default, the tokenizer will tokenize a document sentence by sentence.  If you do not want to precede tokenization with "
      + "sentence segmentation, then a reasonable value for this parameter is 'org.apache.uima.jcas.tcas.DocumentAnnotation'";

  @ConfigurationParameter(
      name = PARAM_WINDOW_CLASS,
      mandatory = false,
      description = WINDOW_TYPE_DESCRIPTION,
      defaultValue = "org.cleartk.token.type.Sentence")
  private Class<? extends Annotation> windowClass;

  /** Type-system adapter used to select tokens and to store the POS tag and lemma on them. */
  private final TokenOps<TOKEN_TYPE> tokenOps;

  /** ClearNLP component obtained in {@link #initialize(UimaContext)} and run on every window. */
  private AbstractComponent tagger;

  /**
   * Creates the annotator around a type-system specific token adapter.
   *
   * @param tokenOps
   *          adapter used to select the tokens of a window and to set their POS tag and lemma
   */
  public PosTagger_ImplBase(TokenOps<TOKEN_TYPE> tokenOps) {
    this.tokenOps = tokenOps;
  }

  /**
   * Initializes the annotator and loads the ClearNLP POS tagging component for the configured
   * model path and language code.
   *
   * @param context
   *          the UIMA context handed to the superclass initializer
   * @throws ResourceInitializationException
   *           if the superclass initialization fails, or wrapping any exception raised while the
   *           tagger component is being obtained
   */
  @Override
  public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);
    try {
      // Load POS tagger model
      this.tagger = NLPGetter.getComponent(modelPath, languageCode, NLPLib.MODE_POS);
    } catch (Exception e) {
      throw new ResourceInitializationException(e);
    }
  }

  /**
   * Runs the tagger over every annotation of the configured window class and stores the resulting
   * POS tag and lemma on each token of that window. Windows that contain no tokens are skipped.
   *
   * @param jCas
   *          the CAS whose windows and tokens are processed
   * @throws AnalysisEngineProcessException
   *           declared by the overridden method
   */
  @Override
  public void process(JCas jCas) throws AnalysisEngineProcessException {

    for (Annotation window : JCasUtil.select(jCas, this.windowClass)) {
      List<TOKEN_TYPE> tokens = this.tokenOps.selectTokens(jCas, window);
      if (tokens.isEmpty()) {
        // Nothing to tag in this window; move on to the next one instead of leaving the method,
        // otherwise every later window in the document would be left untagged.
        continue;
      }

      List<String> tokenStrings = JCasUtil.toText(tokens);

      // As of version 1.3.0, ClearNLP does all processing to go through its own dependency tree
      // structure
      DEPTree clearNlpDepTree = NLPGetter.toDEPTree(tokenStrings);
      this.tagger.process(clearNlpDepTree);

      // Note the ClearNLP counts index 0 as the sentence dependency node, so the POS tag indices
      // are shifted by one from the token indices
      for (int i = 0; i < tokens.size(); i++) {
        TOKEN_TYPE token = tokens.get(i);
        DEPNode node = clearNlpDepTree.get(i + 1);
        this.tokenOps.setPos(jCas, token, node.pos);
        this.tokenOps.setLemma(jCas, token, node.lemma);
      }
    }
  }

}