/*
 * Copyright (c) 2012, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.clearnlp;

import java.util.List;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.util.JCasUtil;

import com.google.common.annotations.Beta;
import com.clearnlp.component.AbstractComponent;
import com.clearnlp.dependency.DEPNode;
import com.clearnlp.dependency.DEPTree;
import com.clearnlp.nlp.NLPGetter;
import com.clearnlp.nlp.NLPLib;
import com.clearnlp.reader.AbstractReader;

/**
 * <br>
 * Copyright (c) 2012, Regents of the University of Colorado <br>
 * All rights reserved.
 * <p>
 * This class provides a base class for wrapping the ClearNLP morphological analyzer into a UIMA
 * based type system. This engine requires POS-tagged tokens and produces lemmatized forms of said
 * tokens.
 * <p>
 * All type-system specific behaviour (selecting the tokens inside a window, reading a token's POS
 * tag, writing a token's lemma) is delegated to the {@link TokenOps} instance handed to the
 * constructor, so subclasses only need to supply a suitable {@link TokenOps} for the target type
 * system.
 * <p>
 * This analyzer is available here:
 * <p>
 * http://clearnlp.googlecode.com
 * <p>
 *
 * @param <TOKEN_TYPE>
 *          the annotation type used to represent tokens in the target type system
 *
 * @author Lee Becker
 *
 */
@Beta
public abstract class MpAnalyzer_ImplBase<TOKEN_TYPE extends Annotation> extends
    JCasAnnotator_ImplBase {

  public static final String DEFAULT_DICTIONARY_FILE_NAME = "dictionary-1.2.0.zip";

  public static final String PARAM_LANGUAGE_CODE = "languageCode";

  @ConfigurationParameter(
      name = PARAM_LANGUAGE_CODE,
      mandatory = false,
      description = "Language code (default value=en).",
      defaultValue = AbstractReader.LANG_EN)
  private String languageCode;

  // Disabled configuration parameter for an explicit dictionary location; kept for reference.
  /*
  public static final String PARAM_DICTIONARY_URI = "dictionaryUri";

  @ConfigurationParameter(
      name = PARAM_DICTIONARY_URI,
      mandatory = false,
      description = "This parameter provides the URI to the morphological analyzer dictionary used for lemmatizing.")
  private URI dictionaryUri;
  */

  public static final String PARAM_WINDOW_CLASS = "windowClass";

  // Each fragment ends with a space so the concatenated description reads as proper sentences.
  private static final String WINDOW_TYPE_DESCRIPTION = "specifies the class type of annotations whose tokens will be analyzed. "
      + "By default, the analyzer processes a document sentence by sentence. If you do not want to precede analysis with "
      + "sentence segmentation, then a reasonable value for this parameter is 'org.apache.uima.jcas.tcas.DocumentAnnotation'";

  @ConfigurationParameter(
      name = PARAM_WINDOW_CLASS,
      mandatory = false,
      description = WINDOW_TYPE_DESCRIPTION,
      defaultValue = "org.cleartk.token.type.Sentence")
  private Class<? extends Annotation> windowClass;

  /** Type-system specific accessors for tokens, POS tags and lemmas. */
  private final TokenOps<TOKEN_TYPE> tokenOps;

  /** ClearNLP morphological analyzer component; created in {@link #initialize(UimaContext)}. */
  private AbstractComponent mpAnalyzer;

  /**
   * Convenience method to create an analysis engine description for this analyzer without setting
   * any configuration parameter, so the declared default values (e.g. the default language code)
   * apply.
   * <p>
   * NOTE(review): the description is built from {@code MpAnalyzer_ImplBase.class}, which is
   * abstract and only declares a constructor taking a {@link TokenOps}; presumably a concrete
   * subclass has to be used to actually instantiate the engine - confirm.
   *
   * @return the analysis engine description
   * @throws ResourceInitializationException
   *           if the description cannot be created
   */
  public static AnalysisEngineDescription getDescription() throws ResourceInitializationException {
    return AnalysisEngineFactory.createEngineDescription(MpAnalyzer_ImplBase.class);
  }

  /**
   * Convenience method to create an analysis engine description for this analyzer with an explicit
   * language code.
   * <p>
   * NOTE(review): see {@link #getDescription()} regarding the use of the abstract class here.
   *
   * @param langCode
   *          value for the {@link #PARAM_LANGUAGE_CODE} configuration parameter
   * @return the analysis engine description
   * @throws ResourceInitializationException
   *           if the description cannot be created
   */
  public static AnalysisEngineDescription getDescription(String langCode)
      throws ResourceInitializationException {
    return AnalysisEngineFactory.createEngineDescription(
        MpAnalyzer_ImplBase.class,
        MpAnalyzer_ImplBase.PARAM_LANGUAGE_CODE,
        langCode);
  }

  /**
   * Creates the analyzer.
   *
   * @param tokenOps
   *          operations used to select tokens from a window, read their POS tags and store their
   *          lemmas
   */
  public MpAnalyzer_ImplBase(TokenOps<TOKEN_TYPE> tokenOps) {
    this.tokenOps = tokenOps;
  }

  /**
   * Initializes the annotator and obtains the ClearNLP morphological analysis component for the
   * configured language code.
   *
   * @param context
   *          the UIMA context
   * @throws ResourceInitializationException
   *           if the super class fails to initialize or the ClearNLP component cannot be obtained;
   *           the original exception is preserved as the cause
   */
  @Override
  public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);

    try {
      // initialize ClearNLP components
      this.mpAnalyzer = NLPGetter.getComponent("", languageCode, NLPLib.MODE_MORPH);
    } catch (Exception e) {
      throw new ResourceInitializationException(e);
    }
  }

  /**
   * Lemmatizes the tokens of every window annotation (of type {@code windowClass}) in the CAS.
   * For each window the tokens and their POS tags are copied into a ClearNLP {@link DEPTree}, the
   * morphological analyzer is run on that tree, and the resulting lemmas are written back to the
   * tokens through the {@link TokenOps}.
   *
   * @param jCas
   *          the CAS to process
   * @throws AnalysisEngineProcessException
   *           declared by the annotator contract
   */
  @Override
  public void process(JCas jCas) throws AnalysisEngineProcessException {
    for (Annotation window : JCasUtil.select(jCas, this.windowClass)) {
      List<TOKEN_TYPE> tokens = this.tokenOps.selectTokens(jCas, window);
      if (tokens.isEmpty()) {
        // No tokens in this window, so there is nothing to tag or lemmatize.
        continue;
      }
      List<String> tokenStrings = JCasUtil.toText(tokens);

      // All processing in ClearNLP goes through the DEPTree structures,
      // so populate it with token and POS tag info.
      // Tree indices start at 1 here, so tree node i pairs with token i - 1.
      DEPTree depTree = NLPGetter.toDEPTree(tokenStrings);
      for (int i = 1; i < depTree.size(); i++) {
        TOKEN_TYPE token = tokens.get(i - 1);
        DEPNode node = depTree.get(i);
        node.pos = this.tokenOps.getPos(jCas, token);
      }

      // Run the morphological analyzer
      this.mpAnalyzer.process(depTree);

      // Pull out lemmas and stuff them back into the CAS tokens
      for (int i = 1; i < depTree.size(); i++) {
        TOKEN_TYPE token = tokens.get(i - 1);
        DEPNode node = depTree.get(i);
        this.tokenOps.setLemma(jCas, token, node.lemma);
      }
    }
  }

}