001/* 
002 * Copyright (c) 2010, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024
025package org.cleartk.token.breakit;
026
027import java.lang.reflect.Constructor;
028import java.text.BreakIterator;
029import java.util.Locale;
030
031import org.apache.uima.UimaContext;
032import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
033import org.apache.uima.jcas.JCas;
034import org.apache.uima.jcas.tcas.Annotation;
035import org.apache.uima.resource.ResourceInitializationException;
036import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
037import org.apache.uima.fit.descriptor.ConfigurationParameter;
038import org.apache.uima.fit.factory.initializable.InitializableFactory;
039
040/**
041 * <br>
042 * Copyright (c) 2010, Regents of the University of Colorado <br>
043 * All rights reserved.
044 * <p>
045 * 
046 * @author Philip Ogren
047 */
048
049public class BreakIteratorAnnotator extends JCasAnnotator_ImplBase {
050
051  public static final String PARAM_LOCALE = "locale";
052
053  @ConfigurationParameter(
054      name = PARAM_LOCALE,
055      description = "provides the name of the locale to be used to instantiate the break iterator")
056  private Locale locale;
057
058  public static enum BreakIteratorType {
059    WORD, SENTENCE
060  }
061
062  public static final String PARAM_BREAK_ITERATOR_TYPE = "breakIteratorType";
063
064  @ConfigurationParameter(
065      name = PARAM_BREAK_ITERATOR_TYPE,
066      description = "provides the type of the locale to be used to instantiate the break iterator.  Should be one of  'WORD' or 'SENTENCE'", defaultValue = "SENTENCE")
067  private BreakIteratorType breakIteratorType;
068
069  public static final String PARAM_ANNOTATION_TYPE_NAME = "annotationTypeName";
070
071  @ConfigurationParameter(name = PARAM_ANNOTATION_TYPE_NAME, 
072      description = "class type of the annotations that are created by this annotator.")
073  private String annotationTypeName;
074
075  private Class<? extends Annotation> annotationClass;
076
077  private Constructor<? extends Annotation> annotationConstructor;
078
079  private BreakIterator breakIterator;
080
081  @Override
082  public void initialize(UimaContext context) throws ResourceInitializationException {
083    super.initialize(context);
084
085    try {
086      annotationClass = InitializableFactory.getClass(annotationTypeName, Annotation.class);
087      annotationConstructor = annotationClass.getConstructor(new Class[] {
088          JCas.class,
089          Integer.TYPE,
090          Integer.TYPE });
091    } catch (Exception e) {
092      throw new ResourceInitializationException(e);
093    }
094
095    if (locale == null) {
096      locale = Locale.getDefault();
097    }
098    if (breakIteratorType == BreakIteratorType.WORD) {
099      breakIterator = BreakIterator.getWordInstance(locale);
100    } else {
101      breakIterator = BreakIterator.getSentenceInstance(locale);
102    }
103
104  }
105
106  @Override
107  public void process(JCas jCas) throws AnalysisEngineProcessException {
108    String text = jCas.getDocumentText();
109    breakIterator.setText(text);
110
111    int index = breakIterator.first();
112    int endIndex;
113    while ((endIndex = breakIterator.next()) != BreakIterator.DONE) {
114      String annotationText = text.substring(index, endIndex);
115      if (!annotationText.trim().equals("")) {
116        try {
117          annotationConstructor.newInstance(jCas, index, endIndex).addToIndexes();
118        } catch (Exception e) {
119          throw new AnalysisEngineProcessException(e);
120        }
121      }
122      index = endIndex;
123    }
124  }
125
126}