/*
 * Copyright (c) 2010, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

package org.cleartk.token.breakit;

import java.lang.reflect.Constructor;
import java.text.BreakIterator;
import java.util.Locale;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.initializable.InitializableFactory;

/**
 * Annotator that splits the document text of a {@link JCas} into segments using a
 * {@link java.text.BreakIterator} (word or sentence flavour) and adds one annotation of a
 * configurable type to the indexes for every segment that contains at least one
 * non-whitespace character.
 *
 * <br>
 * Copyright (c) 2010, Regents of the University of Colorado <br>
 * All rights reserved.
 * <p>
 *
 * @author Philip Ogren
 */

public class BreakIteratorAnnotator extends JCasAnnotator_ImplBase {

  /** Name of the configuration parameter that selects the locale. */
  public static final String PARAM_LOCALE = "locale";

  /** Locale handed to the break iterator factory; replaced by the JVM default when unset. */
  @ConfigurationParameter(
      name = PARAM_LOCALE,
      description = "provides the name of the locale to be used to instantiate the break iterator")
  private Locale locale;

  /** The kinds of break iterator this annotator can instantiate. */
  public enum BreakIteratorType {
    WORD, SENTENCE
  }

  /** Name of the configuration parameter that selects the break iterator flavour. */
  public static final String PARAM_BREAK_ITERATOR_TYPE = "breakIteratorType";

  /** Selects word or sentence segmentation; any value other than WORD yields sentences. */
  @ConfigurationParameter(
      name = PARAM_BREAK_ITERATOR_TYPE,
      description = "provides the type of the break iterator to be instantiated. Should be one of 'WORD' or 'SENTENCE'",
      defaultValue = "SENTENCE")
  private BreakIteratorType breakIteratorType;

  /** Name of the configuration parameter that names the annotation class to create. */
  public static final String PARAM_ANNOTATION_TYPE_NAME = "annotationTypeName";

  /** Class name of the annotation type instantiated for each segment. */
  @ConfigurationParameter(
      name = PARAM_ANNOTATION_TYPE_NAME,
      description = "class type of the annotations that are created by this annotator.")
  private String annotationTypeName;

  /** Annotation class resolved from {@link #annotationTypeName} during initialization. */
  private Class<? extends Annotation> annotationClass;

  /** The {@code (JCas, int, int)} constructor of {@link #annotationClass}. */
  private Constructor<? extends Annotation> annotationConstructor;

  /** Break iterator created once in {@link #initialize(UimaContext)} and reused per document. */
  private BreakIterator breakIterator;

  /**
   * Resolves the configured annotation class and its {@code (JCas, int, int)} constructor, and
   * creates the break iterator for the configured (or default) locale.
   *
   * @param context the UIMA context passed on to the superclass initialization
   * @throws ResourceInitializationException if the annotation class cannot be resolved or does
   *         not expose a public {@code (JCas, int, int)} constructor
   */
  @Override
  public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);

    try {
      annotationClass = InitializableFactory.getClass(annotationTypeName, Annotation.class);
      annotationConstructor = annotationClass.getConstructor(JCas.class, int.class, int.class);
    } catch (Exception e) {
      throw new ResourceInitializationException(e);
    }

    if (locale == null) {
      locale = Locale.getDefault();
    }

    if (breakIteratorType == BreakIteratorType.WORD) {
      breakIterator = BreakIterator.getWordInstance(locale);
    } else {
      breakIterator = BreakIterator.getSentenceInstance(locale);
    }
  }

  /**
   * Walks the break iterator over the document text and adds one annotation per segment that is
   * not made up solely of whitespace. Does nothing when the JCas has no document text.
   *
   * @param jCas the CAS whose document text is segmented and which receives the annotations
   * @throws AnalysisEngineProcessException if an annotation cannot be instantiated or indexed
   */
  @Override
  public void process(JCas jCas) throws AnalysisEngineProcessException {
    String text = jCas.getDocumentText();
    if (text == null) {
      // BreakIterator.setText(String) throws NullPointerException on null, and there is
      // nothing to segment anyway.
      return;
    }
    breakIterator.setText(text);

    int begin = breakIterator.first();
    for (int end = breakIterator.next(); end != BreakIterator.DONE; end = breakIterator.next()) {
      if (containsNonWhitespace(text, begin, end)) {
        try {
          annotationConstructor.newInstance(jCas, begin, end).addToIndexes();
        } catch (Exception e) {
          throw new AnalysisEngineProcessException(e);
        }
      }
      // The end of this segment is the start of the next one.
      begin = end;
    }
  }

  /**
   * Reports whether {@code text[begin, end)} holds a character that {@link String#trim()} would
   * keep, i.e. one with a code unit greater than {@code ' '}. Equivalent to
   * {@code !text.substring(begin, end).trim().equals("")} without the intermediate strings.
   */
  private static boolean containsNonWhitespace(String text, int begin, int end) {
    for (int i = begin; i < end; i++) {
      if (text.charAt(i) > ' ') {
        return true;
      }
    }
    return false;
  }

}