001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.opennlp.tools;
025
026import java.io.InputStream;
027import java.lang.reflect.Constructor;
028import java.util.ArrayList;
029import java.util.Collections;
030import java.util.List;
031import java.util.regex.Matcher;
032import java.util.regex.Pattern;
033
034import opennlp.tools.sentdetect.SentenceDetector;
035import opennlp.tools.sentdetect.SentenceDetectorME;
036import opennlp.tools.sentdetect.SentenceModel;
037import opennlp.tools.util.Span;
038
039import org.apache.uima.UimaContext;
040import org.apache.uima.analysis_engine.AnalysisEngineDescription;
041import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
042import org.apache.uima.jcas.JCas;
043import org.apache.uima.jcas.tcas.Annotation;
044import org.apache.uima.resource.ResourceInitializationException;
045import org.cleartk.util.IoUtil;
046import org.cleartk.util.ParamUtil;
047import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
048import org.apache.uima.fit.descriptor.ConfigurationParameter;
049import org.apache.uima.fit.descriptor.TypeCapability;
050import org.apache.uima.fit.factory.AnalysisEngineFactory;
051import org.apache.uima.fit.factory.initializable.InitializableFactory;
052import org.apache.uima.fit.util.JCasUtil;
053
054import com.google.common.annotations.Beta;
055
056/**
057 * <br>
058 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
059 * All rights reserved.
060 * 
061 * 
062 * @author Philip Ogren
063 * @author Lee Becker
064 * 
065 *         This sentence segmenter is a simple wrapper around the OpenNLP SentenceDetector with
066 *         additional sentence detection added that handles multiple newlines (i.e. if multiple
067 *         newlines appear (separated only by whitespace) together, then this is treated as a
068 *         sentence delimiter.
069 * 
070 * @see SentenceDetector
071 */
072@Beta
073@TypeCapability(outputs = "org.cleartk.token.type.Sentence")
074public class SentenceAnnotator extends JCasAnnotator_ImplBase {
075
076  public static AnalysisEngineDescription getDescription() throws ResourceInitializationException {
077    return AnalysisEngineFactory.createEngineDescription(
078        SentenceAnnotator.class,
079        PARAM_SENTENCE_MODEL_PATH,
080        ParamUtil.getParameterValue(PARAM_SENTENCE_MODEL_PATH, "/models/en-sent.bin"),
081        PARAM_WINDOW_CLASS_NAMES,
082        ParamUtil.getParameterValue(PARAM_WINDOW_CLASS_NAMES, null));
083  }
084
085  public static final String PARAM_SENTENCE_MODEL_PATH = "sentenceModelPath";
086
087  @ConfigurationParameter(
088      name = PARAM_SENTENCE_MODEL_PATH,
089      mandatory = true,
090      description = "provides the path of the OpenNLP sentence segmenter model file")
091  private String sentenceModelPath;
092
093  public static final String PARAM_SENTENCE_TYPE_NAME = "sentenceTypeName";
094
095  public static final String PARAM_WINDOW_CLASS_NAMES = "windowClassNames";
096
097  @ConfigurationParameter(
098      name = PARAM_WINDOW_CLASS_NAMES,
099      mandatory = false,
100      description = "provides an array of the annotation types that will be processed by this sentence annotator.  If the parameter is not filled, then SentenceAnnotator will process on the contents of jCas.getDocumentText().  It us up to the caller to ensure annotations do not overlap.")
101  private String[] windowClassNames;
102
103  @ConfigurationParameter(
104      name = PARAM_SENTENCE_TYPE_NAME,
105      description = "class type of the sentences that are created by this annotator. If this parameter is not filled, then sentencesof type org.cleartk.type.Sentence will be created.",
106      defaultValue = "org.cleartk.token.type.Sentence")
107  private String sentenceTypeName;
108
109  Class<? extends Annotation> sentenceClass;
110
111  protected List<Class<? extends Annotation>> windowClasses;
112
113  Constructor<? extends Annotation> sentenceConstructor;
114
115  public static final String multipleNewlinesRegex = "\\s*\\n\\s*\\n\\s*";
116
117  SentenceDetector sentenceDetector;
118
119  Pattern multipleNewlinesPattern;
120
121  Pattern leadingWhitespacePattern;
122
123  Pattern trailingWhitespacePattern;
124
125  @Override
126  public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
127    super.initialize(uimaContext);
128
129    try {
130      sentenceClass = InitializableFactory.getClass(sentenceTypeName, Annotation.class);
131      sentenceConstructor = sentenceClass.getConstructor(new Class[] {
132          JCas.class,
133          Integer.TYPE,
134          Integer.TYPE });
135
136      if (windowClassNames != null && windowClassNames.length > 0) {
137        windowClasses = new ArrayList<Class<? extends Annotation>>();
138        for (String windowClassName : windowClassNames) {
139          windowClasses.add(InitializableFactory.getClass(windowClassName, Annotation.class));
140        }
141      }
142
143      InputStream modelInputStream = IoUtil.getInputStream(
144          SentenceAnnotator.class,
145          sentenceModelPath);
146      SentenceModel model = new SentenceModel(modelInputStream);
147      sentenceDetector = new SentenceDetectorME(model);
148      multipleNewlinesPattern = Pattern.compile(multipleNewlinesRegex, Pattern.MULTILINE
149          | Pattern.DOTALL);
150      leadingWhitespacePattern = Pattern.compile("^\\s+");
151      trailingWhitespacePattern = Pattern.compile("\\s+$");
152    } catch (Exception e) {
153      throw new ResourceInitializationException(e);
154    }
155  }
156
157  @Override
158  public void process(JCas jCas) throws AnalysisEngineProcessException {
159    if (windowClasses == null) {
160      // No window class names are set, operate on the entirety of the CAS' document text
161      String text = jCas.getDocumentText();
162      processText(jCas, text, 0);
163    } else {
164      // Window class names are specified, iterate over all annotations of the specified classes
165      for (Class<? extends Annotation> windowClass : windowClasses) {
166        // make list copy to avoid concurrent modification
167        for (Annotation window : new ArrayList<Annotation>(JCasUtil.select(jCas, windowClass))) {
168          String text = window.getCoveredText();
169          processText(jCas, text, window.getBegin());
170        }
171      }
172    }
173  }
174
175  protected void processText(JCas jCas, String text, int textOffset)
176      throws AnalysisEngineProcessException {
177    List<Integer> sentenceOffsets = getSentenceOffsets(text);
178
179    int begin = 0;
180    int end = 0;
181
182    // advance the first sentence to first non-whitespace char
183    Matcher matcher;
184    matcher = leadingWhitespacePattern.matcher(text);
185    if (matcher.find()) {
186      begin += matcher.group().length();
187    }
188    try {
189      for (Integer offset : sentenceOffsets) {
190        end = offset; // offset is really the beginning of the next sentence so we may adjust this
191                      // below.
192        String sentenceText = text.substring(begin, end);
193        // it is possible that there is a duplicate offset or that getSentenceOffsets returned the
194        // first sentence offset
195        if (sentenceText.trim().length() > 0) {
196          matcher = trailingWhitespacePattern.matcher(sentenceText);
197          if (matcher.find()) {
198            end -= matcher.group().length();
199          }
200          sentenceConstructor.newInstance(jCas, textOffset + begin, textOffset + end).addToIndexes();
201        }
202        begin = offset; // we need to advance begin regardless of whether a sentence was created.
203      }
204      // take the remaining text if there is any and add it to a sentence.
205      // this code will not execute if the text ends with a sentence detected by
206      // SentenceDetector because it actually returns an offset corresponding to the end
207      // of the last sentence (see note on getSentenceOffsets)
208      if (begin < text.length()) {
209        String sentenceText = text.substring(begin, text.length());
210        end = text.length();
211        if (sentenceText.trim().length() > 0) {
212          matcher = trailingWhitespacePattern.matcher(sentenceText);
213          if (matcher.find()) {
214            end -= matcher.group().length();
215          }
216          sentenceConstructor.newInstance(jCas, textOffset + begin, textOffset + end).addToIndexes();
217        }
218      }
219    } catch (Exception e) {
220      throw new AnalysisEngineProcessException(e);
221    }
222  }
223
224  /**
225   * returns a list of the beginnings of sentences - except (possibly) the first sentence - from
226   * both the OpenNLP sentence detector and the multiple newlines regex. It is possible for this
227   * method to return duplicate values. The sentence detector will return an offset corresponding to
228   * the end of the text if the last non-whitespace character was classified an end of sentence
229   * character.
230   */
231  private List<Integer> getSentenceOffsets(String text) {
232    Matcher matcher = multipleNewlinesPattern.matcher(text);
233    List<Integer> offsets = new ArrayList<Integer>();
234    while (matcher.find()) {
235      offsets.add(matcher.end());
236    }
237
238    Span[] sentenceOffsetsML = sentenceDetector.sentPosDetect(text);
239    for (int i = 0; i < sentenceOffsetsML.length; i++) {
240      offsets.add(sentenceOffsetsML[i].getStart());
241    }
242    Collections.sort(offsets);
243    return offsets;
244  }
245
246}