001/** 002 * Copyright (c) 2007-2008, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.opennlp.tools; 025 026import java.io.IOException; 027import java.io.InputStream; 028import java.util.Iterator; 029import java.util.List; 030 031import opennlp.tools.parser.AbstractBottomUpParser; 032import opennlp.tools.parser.Parse; 033import opennlp.tools.parser.ParserModel; 034import opennlp.tools.util.Span; 035 036import org.apache.uima.UimaContext; 037import org.apache.uima.analysis_engine.AnalysisEngineDescription; 038import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 039import org.apache.uima.jcas.JCas; 040import org.apache.uima.jcas.tcas.Annotation; 041import org.apache.uima.resource.ResourceInitializationException; 042import org.cleartk.opennlp.tools.parser.CasPosTagger; 043import org.cleartk.opennlp.tools.parser.DefaultOutputTypesHelper; 044import org.cleartk.opennlp.tools.parser.InputTypesHelper; 045import org.cleartk.opennlp.tools.parser.OutputTypesHelper; 046import org.cleartk.opennlp.tools.parser.Parser; 047import org.cleartk.opennlp.tools.parser.ParserWrapper_ImplBase; 048import org.cleartk.util.IoUtil; 049import org.cleartk.util.ParamUtil; 050import org.apache.uima.fit.descriptor.ConfigurationParameter; 051import org.apache.uima.fit.factory.AnalysisEngineFactory; 052import org.apache.uima.fit.factory.initializable.InitializableFactory; 053 054import com.google.common.annotations.Beta; 055 056/** 057 * <br> 058 * Copyright (c) 2010, Regents of the University of Colorado <br> 059 * All rights reserved. 060 * 061 * This class provides a uima wrapper for the OpenNLP chunking parser that is specific to the 062 * ClearTK type system found in the cleartk-syntax project. However, by specifying your own 063 * implementations of {@link InputTypesHelper} and {@link OutputTypesHelper} you can use your own 064 * types for sentences, tokens, and part-of-speech tags. 065 * <p> 066 * The default behavior of the OpenNLP chunking parser is to perform part-of-speech tagging in 067 * addition to syntactic parsing. This may not be desirable in some situations where you have 068 * already populated the CAS with part-of-speech tags. In such cases you can set the configuration 069 * parameter with the name {@link #PARAM_USE_TAGS_FROM_CAS} to "true". This will bypass the 070 * OpenNLP's part-of-speech tagger and instead use tags from the CAS as defined by the 071 * implementation of {@link InputTypesHelper} you are using. 072 * 073 * @author Philipp Wetzler, Philip Ogren. 074 */ 075@Beta 076public class ParserAnnotator<TOKEN_TYPE extends Annotation, SENTENCE_TYPE extends Annotation, TOP_NODE_TYPE extends Annotation> 077 extends ParserWrapper_ImplBase<TOKEN_TYPE, SENTENCE_TYPE, Parse, TOP_NODE_TYPE> { 078 079 public static final String DEFAULT_PARSER_MODEL_PATH = "/models/en-parser-chunking.bin"; 080 081 public static final int DEFAULT_BEAM_SIZE = AbstractBottomUpParser.defaultBeamSize; 082 083 public static final float DEFAULT_ADVANCE_PERCENTAGE = (float) AbstractBottomUpParser.defaultAdvancePercentage; 084 085 public static final String PARAM_PARSER_MODEL_PATH = "parserModelPath"; 086 087 @ConfigurationParameter( 088 name = PARAM_PARSER_MODEL_PATH, 089 defaultValue = DEFAULT_PARSER_MODEL_PATH, 090 description = "provides the path of the OpenNLP parser model build file, e.g. /models/en-parser-chunking.bin. See javadoc for opennlp.tools.parser.chunking.Parser.") 091 private String parserModelPath; 092 093 public static final String PARAM_BEAM_SIZE = "beamSize"; 094 095 @ConfigurationParameter( 096 name = PARAM_BEAM_SIZE, 097 defaultValue = "" + DEFAULT_BEAM_SIZE, 098 description = "indicates the beam size that should be used in the parser's search. See javadoc for opennlp.tools.parser.chunking.Parser.") 099 private int beamSize; 100 101 public static final String PARAM_ADVANCE_PERCENTAGE = "advancePercentage"; 102 103 @ConfigurationParameter( 104 name = PARAM_ADVANCE_PERCENTAGE, 105 defaultValue = "" + AbstractBottomUpParser.defaultAdvancePercentage, 106 description = "indicates \"the amount of probability mass required of advanced outcomes\". See javadoc for opennlp.tools.parser.chunking.Parser.") 107 private float advancePercentage; 108 109 public static final String PARAM_USE_TAGS_FROM_CAS = "useTagsFromCas"; 110 111 @ConfigurationParameter( 112 name = PARAM_USE_TAGS_FROM_CAS, 113 defaultValue = "false", 114 description = "determines whether or not part-of-speech tags that are already in the CAS will be used or not.") 115 private boolean useTagsFromCas; 116 117 protected Parser parser; 118 119 protected CasPosTagger<TOKEN_TYPE, SENTENCE_TYPE> casTagger; 120 121 @SuppressWarnings("unchecked") 122 @Override 123 public void initialize(UimaContext ctx) throws ResourceInitializationException { 124 super.initialize(ctx); 125 126 inputTypesHelper = InitializableFactory.create( 127 ctx, 128 inputTypesHelperClassName, 129 InputTypesHelper.class); 130 131 try { 132 InputStream modelInputStream = IoUtil.getInputStream(ParserAnnotator.class, parserModelPath); 133 ParserModel parserModel = new ParserModel(modelInputStream); 134 if (useTagsFromCas) { 135 this.casTagger = new CasPosTagger<TOKEN_TYPE, SENTENCE_TYPE>(inputTypesHelper); 136 this.parser = new Parser(parserModel, beamSize, advancePercentage, casTagger); 137 } else { 138 this.parser = new Parser(parserModel, beamSize, advancePercentage); 139 } 140 } catch (IOException e) { 141 throw new ResourceInitializationException(e); 142 } 143 } 144 145 @Override 146 public void process(JCas jCas) throws AnalysisEngineProcessException { 147 String text = jCas.getDocumentText(); 148 149 List<SENTENCE_TYPE> sentenceList = inputTypesHelper.getSentences(jCas); 150 151 for (SENTENCE_TYPE sentence : sentenceList) { 152 153 Parse parse = new Parse( 154 text, 155 new Span(sentence.getBegin(), sentence.getEnd()), 156 AbstractBottomUpParser.INC_NODE, 157 1, 158 null); 159 160 List<TOKEN_TYPE> tokenList = inputTypesHelper.getTokens(jCas, sentence); 161 162 for (TOKEN_TYPE token : tokenList) { 163 parse.insert(new Parse( 164 text, 165 new Span(token.getBegin(), token.getEnd()), 166 AbstractBottomUpParser.TOK_NODE, 167 0, 168 0)); 169 } 170 171 if (useTagsFromCas) { 172 this.casTagger.setTokens(tokenList); 173 } 174 175 parse = this.parser.parse(parse); 176 177 // if the sentence was successfully parsed, add the tree to the 178 // sentence 179 if (parse.getType() == AbstractBottomUpParser.TOP_NODE) { 180 outputTypesHelper.addParse(jCas, parse, sentence, tokenList); 181 } 182 183 // add the POS tags to the tokens 184 if (!useTagsFromCas) { 185 setPOSTags(parse, tokenList.iterator(), jCas); 186 } 187 } 188 } 189 190 protected void setPOSTags(Parse p, Iterator<TOKEN_TYPE> tokenIterator, JCas view) { 191 if (p.isPosTag()) { 192 TOKEN_TYPE t = tokenIterator.next(); 193 inputTypesHelper.setPosTag(t, p.getType()); 194 } else { 195 for (Parse child : p.getChildren()) { 196 setPOSTags(child, tokenIterator, view); 197 } 198 } 199 } 200 201 public static AnalysisEngineDescription getDescription() throws ResourceInitializationException { 202 return AnalysisEngineFactory.createEngineDescription( 203 ParserAnnotator.class, 204 PARAM_PARSER_MODEL_PATH, 205 ParamUtil.getParameterValue(PARAM_PARSER_MODEL_PATH, "/models/en-parser-chunking.bin"), 206 PARAM_OUTPUT_TYPES_HELPER_CLASS_NAME, 207 DefaultOutputTypesHelper.class.getName()); 208 } 209}