001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.opennlp.tools;
025
026import java.io.IOException;
027import java.io.InputStream;
028import java.util.Iterator;
029import java.util.List;
030
031import opennlp.tools.parser.AbstractBottomUpParser;
032import opennlp.tools.parser.Parse;
033import opennlp.tools.parser.ParserModel;
034import opennlp.tools.util.Span;
035
036import org.apache.uima.UimaContext;
037import org.apache.uima.analysis_engine.AnalysisEngineDescription;
038import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
039import org.apache.uima.jcas.JCas;
040import org.apache.uima.jcas.tcas.Annotation;
041import org.apache.uima.resource.ResourceInitializationException;
042import org.cleartk.opennlp.tools.parser.CasPosTagger;
043import org.cleartk.opennlp.tools.parser.DefaultOutputTypesHelper;
044import org.cleartk.opennlp.tools.parser.InputTypesHelper;
045import org.cleartk.opennlp.tools.parser.OutputTypesHelper;
046import org.cleartk.opennlp.tools.parser.Parser;
047import org.cleartk.opennlp.tools.parser.ParserWrapper_ImplBase;
048import org.cleartk.util.IoUtil;
049import org.cleartk.util.ParamUtil;
050import org.apache.uima.fit.descriptor.ConfigurationParameter;
051import org.apache.uima.fit.factory.AnalysisEngineFactory;
052import org.apache.uima.fit.factory.initializable.InitializableFactory;
053
054import com.google.common.annotations.Beta;
055
056/**
057 * <br>
058 * Copyright (c) 2010, Regents of the University of Colorado <br>
059 * All rights reserved.
060 * 
061 * This class provides a uima wrapper for the OpenNLP chunking parser that is specific to the
062 * ClearTK type system found in the cleartk-syntax project. However, by specifying your own
063 * implementations of {@link InputTypesHelper} and {@link OutputTypesHelper} you can use your own
064 * types for sentences, tokens, and part-of-speech tags.
065 * <p>
066 * The default behavior of the OpenNLP chunking parser is to perform part-of-speech tagging in
067 * addition to syntactic parsing. This may not be desirable in some situations where you have
068 * already populated the CAS with part-of-speech tags. In such cases you can set the configuration
069 * parameter with the name {@link #PARAM_USE_TAGS_FROM_CAS} to "true". This will bypass the
070 * OpenNLP's part-of-speech tagger and instead use tags from the CAS as defined by the
071 * implementation of {@link InputTypesHelper} you are using.
072 * 
073 * @author Philipp Wetzler, Philip Ogren.
074 */
075@Beta
076public class ParserAnnotator<TOKEN_TYPE extends Annotation, SENTENCE_TYPE extends Annotation, TOP_NODE_TYPE extends Annotation>
077    extends ParserWrapper_ImplBase<TOKEN_TYPE, SENTENCE_TYPE, Parse, TOP_NODE_TYPE> {
078
079  public static final String DEFAULT_PARSER_MODEL_PATH = "/models/en-parser-chunking.bin";
080
081  public static final int DEFAULT_BEAM_SIZE = AbstractBottomUpParser.defaultBeamSize;
082
083  public static final float DEFAULT_ADVANCE_PERCENTAGE = (float) AbstractBottomUpParser.defaultAdvancePercentage;
084
085  public static final String PARAM_PARSER_MODEL_PATH = "parserModelPath";
086
087  @ConfigurationParameter(
088      name = PARAM_PARSER_MODEL_PATH,
089      defaultValue = DEFAULT_PARSER_MODEL_PATH,
090      description = "provides the path of the OpenNLP parser model build file, e.g. /models/en-parser-chunking.bin.  See javadoc for opennlp.tools.parser.chunking.Parser.")
091  private String parserModelPath;
092
093  public static final String PARAM_BEAM_SIZE = "beamSize";
094
095  @ConfigurationParameter(
096      name = PARAM_BEAM_SIZE,
097      defaultValue = "" + DEFAULT_BEAM_SIZE,
098      description = "indicates the beam size that should be used in the parser's search.  See javadoc for opennlp.tools.parser.chunking.Parser.")
099  private int beamSize;
100
101  public static final String PARAM_ADVANCE_PERCENTAGE = "advancePercentage";
102
103  @ConfigurationParameter(
104      name = PARAM_ADVANCE_PERCENTAGE,
105      defaultValue = "" + AbstractBottomUpParser.defaultAdvancePercentage,
106      description = "indicates \"the amount of probability mass required of advanced outcomes\".  See javadoc for opennlp.tools.parser.chunking.Parser.")
107  private float advancePercentage;
108
109  public static final String PARAM_USE_TAGS_FROM_CAS = "useTagsFromCas";
110
111  @ConfigurationParameter(
112      name = PARAM_USE_TAGS_FROM_CAS,
113      defaultValue = "false",
114      description = "determines whether or not part-of-speech tags that are already in the CAS will be used or not.")
115  private boolean useTagsFromCas;
116
117  protected Parser parser;
118
119  protected CasPosTagger<TOKEN_TYPE, SENTENCE_TYPE> casTagger;
120
121  @SuppressWarnings("unchecked")
122  @Override
123  public void initialize(UimaContext ctx) throws ResourceInitializationException {
124    super.initialize(ctx);
125
126    inputTypesHelper = InitializableFactory.create(
127        ctx,
128        inputTypesHelperClassName,
129        InputTypesHelper.class);
130
131    try {
132      InputStream modelInputStream = IoUtil.getInputStream(ParserAnnotator.class, parserModelPath);
133      ParserModel parserModel = new ParserModel(modelInputStream);
134      if (useTagsFromCas) {
135        this.casTagger = new CasPosTagger<TOKEN_TYPE, SENTENCE_TYPE>(inputTypesHelper);
136        this.parser = new Parser(parserModel, beamSize, advancePercentage, casTagger);
137      } else {
138        this.parser = new Parser(parserModel, beamSize, advancePercentage);
139      }
140    } catch (IOException e) {
141      throw new ResourceInitializationException(e);
142    }
143  }
144
145  @Override
146  public void process(JCas jCas) throws AnalysisEngineProcessException {
147    String text = jCas.getDocumentText();
148
149    List<SENTENCE_TYPE> sentenceList = inputTypesHelper.getSentences(jCas);
150
151    for (SENTENCE_TYPE sentence : sentenceList) {
152
153      Parse parse = new Parse(
154          text,
155          new Span(sentence.getBegin(), sentence.getEnd()),
156          AbstractBottomUpParser.INC_NODE,
157          1,
158          null);
159
160      List<TOKEN_TYPE> tokenList = inputTypesHelper.getTokens(jCas, sentence);
161
162      for (TOKEN_TYPE token : tokenList) {
163        parse.insert(new Parse(
164            text,
165            new Span(token.getBegin(), token.getEnd()),
166            AbstractBottomUpParser.TOK_NODE,
167            0,
168            0));
169      }
170
171      if (useTagsFromCas) {
172        this.casTagger.setTokens(tokenList);
173      }
174
175      parse = this.parser.parse(parse);
176
177      // if the sentence was successfully parsed, add the tree to the
178      // sentence
179      if (parse.getType() == AbstractBottomUpParser.TOP_NODE) {
180        outputTypesHelper.addParse(jCas, parse, sentence, tokenList);
181      }
182
183      // add the POS tags to the tokens
184      if (!useTagsFromCas) {
185        setPOSTags(parse, tokenList.iterator(), jCas);
186      }
187    }
188  }
189
190  protected void setPOSTags(Parse p, Iterator<TOKEN_TYPE> tokenIterator, JCas view) {
191    if (p.isPosTag()) {
192      TOKEN_TYPE t = tokenIterator.next();
193      inputTypesHelper.setPosTag(t, p.getType());
194    } else {
195      for (Parse child : p.getChildren()) {
196        setPOSTags(child, tokenIterator, view);
197      }
198    }
199  }
200
201  public static AnalysisEngineDescription getDescription() throws ResourceInitializationException {
202    return AnalysisEngineFactory.createEngineDescription(
203        ParserAnnotator.class,
204        PARAM_PARSER_MODEL_PATH,
205        ParamUtil.getParameterValue(PARAM_PARSER_MODEL_PATH, "/models/en-parser-chunking.bin"),
206        PARAM_OUTPUT_TYPES_HELPER_CLASS_NAME,
207        DefaultOutputTypesHelper.class.getName());
208  }
209}