001/** 002 * Copyright (c) 2007-2008, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.opennlp.tools; 025 026import java.io.InputStream; 027import java.lang.reflect.Constructor; 028import java.util.ArrayList; 029import java.util.Collections; 030import java.util.List; 031import java.util.regex.Matcher; 032import java.util.regex.Pattern; 033 034import opennlp.tools.sentdetect.SentenceDetector; 035import opennlp.tools.sentdetect.SentenceDetectorME; 036import opennlp.tools.sentdetect.SentenceModel; 037import opennlp.tools.util.Span; 038 039import org.apache.uima.UimaContext; 040import org.apache.uima.analysis_engine.AnalysisEngineDescription; 041import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 042import org.apache.uima.jcas.JCas; 043import org.apache.uima.jcas.tcas.Annotation; 044import org.apache.uima.resource.ResourceInitializationException; 045import org.cleartk.util.IoUtil; 046import org.cleartk.util.ParamUtil; 047import org.apache.uima.fit.component.JCasAnnotator_ImplBase; 048import org.apache.uima.fit.descriptor.ConfigurationParameter; 049import org.apache.uima.fit.descriptor.TypeCapability; 050import org.apache.uima.fit.factory.AnalysisEngineFactory; 051import org.apache.uima.fit.factory.initializable.InitializableFactory; 052import org.apache.uima.fit.util.JCasUtil; 053 054import com.google.common.annotations.Beta; 055 056/** 057 * <br> 058 * Copyright (c) 2007-2008, Regents of the University of Colorado <br> 059 * All rights reserved. 060 * 061 * 062 * @author Philip Ogren 063 * @author Lee Becker 064 * 065 * This sentence segmenter is a simple wrapper around the OpenNLP SentenceDetector with 066 * additional sentence detection added that handles multiple newlines (i.e. if multiple 067 * newlines appear (separated only by whitespace) together, then this is treated as a 068 * sentence delimiter. 069 * 070 * @see SentenceDetector 071 */ 072@Beta 073@TypeCapability(outputs = "org.cleartk.token.type.Sentence") 074public class SentenceAnnotator extends JCasAnnotator_ImplBase { 075 076 public static AnalysisEngineDescription getDescription() throws ResourceInitializationException { 077 return AnalysisEngineFactory.createEngineDescription( 078 SentenceAnnotator.class, 079 PARAM_SENTENCE_MODEL_PATH, 080 ParamUtil.getParameterValue(PARAM_SENTENCE_MODEL_PATH, "/models/en-sent.bin"), 081 PARAM_WINDOW_CLASS_NAMES, 082 ParamUtil.getParameterValue(PARAM_WINDOW_CLASS_NAMES, null)); 083 } 084 085 public static final String PARAM_SENTENCE_MODEL_PATH = "sentenceModelPath"; 086 087 @ConfigurationParameter( 088 name = PARAM_SENTENCE_MODEL_PATH, 089 mandatory = true, 090 description = "provides the path of the OpenNLP sentence segmenter model file") 091 private String sentenceModelPath; 092 093 public static final String PARAM_SENTENCE_TYPE_NAME = "sentenceTypeName"; 094 095 public static final String PARAM_WINDOW_CLASS_NAMES = "windowClassNames"; 096 097 @ConfigurationParameter( 098 name = PARAM_WINDOW_CLASS_NAMES, 099 mandatory = false, 100 description = "provides an array of the annotation types that will be processed by this sentence annotator. If the parameter is not filled, then SentenceAnnotator will process on the contents of jCas.getDocumentText(). It us up to the caller to ensure annotations do not overlap.") 101 private String[] windowClassNames; 102 103 @ConfigurationParameter( 104 name = PARAM_SENTENCE_TYPE_NAME, 105 description = "class type of the sentences that are created by this annotator. If this parameter is not filled, then sentencesof type org.cleartk.type.Sentence will be created.", 106 defaultValue = "org.cleartk.token.type.Sentence") 107 private String sentenceTypeName; 108 109 Class<? extends Annotation> sentenceClass; 110 111 protected List<Class<? extends Annotation>> windowClasses; 112 113 Constructor<? extends Annotation> sentenceConstructor; 114 115 public static final String multipleNewlinesRegex = "\\s*\\n\\s*\\n\\s*"; 116 117 SentenceDetector sentenceDetector; 118 119 Pattern multipleNewlinesPattern; 120 121 Pattern leadingWhitespacePattern; 122 123 Pattern trailingWhitespacePattern; 124 125 @Override 126 public void initialize(UimaContext uimaContext) throws ResourceInitializationException { 127 super.initialize(uimaContext); 128 129 try { 130 sentenceClass = InitializableFactory.getClass(sentenceTypeName, Annotation.class); 131 sentenceConstructor = sentenceClass.getConstructor(new Class[] { 132 JCas.class, 133 Integer.TYPE, 134 Integer.TYPE }); 135 136 if (windowClassNames != null && windowClassNames.length > 0) { 137 windowClasses = new ArrayList<Class<? extends Annotation>>(); 138 for (String windowClassName : windowClassNames) { 139 windowClasses.add(InitializableFactory.getClass(windowClassName, Annotation.class)); 140 } 141 } 142 143 InputStream modelInputStream = IoUtil.getInputStream( 144 SentenceAnnotator.class, 145 sentenceModelPath); 146 SentenceModel model = new SentenceModel(modelInputStream); 147 sentenceDetector = new SentenceDetectorME(model); 148 multipleNewlinesPattern = Pattern.compile(multipleNewlinesRegex, Pattern.MULTILINE 149 | Pattern.DOTALL); 150 leadingWhitespacePattern = Pattern.compile("^\\s+"); 151 trailingWhitespacePattern = Pattern.compile("\\s+$"); 152 } catch (Exception e) { 153 throw new ResourceInitializationException(e); 154 } 155 } 156 157 @Override 158 public void process(JCas jCas) throws AnalysisEngineProcessException { 159 if (windowClasses == null) { 160 // No window class names are set, operate on the entirety of the CAS' document text 161 String text = jCas.getDocumentText(); 162 processText(jCas, text, 0); 163 } else { 164 // Window class names are specified, iterate over all annotations of the specified classes 165 for (Class<? extends Annotation> windowClass : windowClasses) { 166 // make list copy to avoid concurrent modification 167 for (Annotation window : new ArrayList<Annotation>(JCasUtil.select(jCas, windowClass))) { 168 String text = window.getCoveredText(); 169 processText(jCas, text, window.getBegin()); 170 } 171 } 172 } 173 } 174 175 protected void processText(JCas jCas, String text, int textOffset) 176 throws AnalysisEngineProcessException { 177 List<Integer> sentenceOffsets = getSentenceOffsets(text); 178 179 int begin = 0; 180 int end = 0; 181 182 // advance the first sentence to first non-whitespace char 183 Matcher matcher; 184 matcher = leadingWhitespacePattern.matcher(text); 185 if (matcher.find()) { 186 begin += matcher.group().length(); 187 } 188 try { 189 for (Integer offset : sentenceOffsets) { 190 end = offset; // offset is really the beginning of the next sentence so we may adjust this 191 // below. 192 String sentenceText = text.substring(begin, end); 193 // it is possible that there is a duplicate offset or that getSentenceOffsets returned the 194 // first sentence offset 195 if (sentenceText.trim().length() > 0) { 196 matcher = trailingWhitespacePattern.matcher(sentenceText); 197 if (matcher.find()) { 198 end -= matcher.group().length(); 199 } 200 sentenceConstructor.newInstance(jCas, textOffset + begin, textOffset + end).addToIndexes(); 201 } 202 begin = offset; // we need to advance begin regardless of whether a sentence was created. 203 } 204 // take the remaining text if there is any and add it to a sentence. 205 // this code will not execute if the text ends with a sentence detected by 206 // SentenceDetector because it actually returns an offset corresponding to the end 207 // of the last sentence (see note on getSentenceOffsets) 208 if (begin < text.length()) { 209 String sentenceText = text.substring(begin, text.length()); 210 end = text.length(); 211 if (sentenceText.trim().length() > 0) { 212 matcher = trailingWhitespacePattern.matcher(sentenceText); 213 if (matcher.find()) { 214 end -= matcher.group().length(); 215 } 216 sentenceConstructor.newInstance(jCas, textOffset + begin, textOffset + end).addToIndexes(); 217 } 218 } 219 } catch (Exception e) { 220 throw new AnalysisEngineProcessException(e); 221 } 222 } 223 224 /** 225 * returns a list of the beginnings of sentences - except (possibly) the first sentence - from 226 * both the OpenNLP sentence detector and the multiple newlines regex. It is possible for this 227 * method to return duplicate values. The sentence detector will return an offset corresponding to 228 * the end of the text if the last non-whitespace character was classified an end of sentence 229 * character. 230 */ 231 private List<Integer> getSentenceOffsets(String text) { 232 Matcher matcher = multipleNewlinesPattern.matcher(text); 233 List<Integer> offsets = new ArrayList<Integer>(); 234 while (matcher.find()) { 235 offsets.add(matcher.end()); 236 } 237 238 Span[] sentenceOffsetsML = sentenceDetector.sentPosDetect(text); 239 for (int i = 0; i < sentenceOffsetsML.length; i++) { 240 offsets.add(sentenceOffsetsML[i].getStart()); 241 } 242 Collections.sort(offsets); 243 return offsets; 244 } 245 246}