001/** 002 * Copyright (c) 2007-2008, Regents of the University of Colorado 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions are met: 007 * 008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 011 * 012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 022 * POSSIBILITY OF SUCH DAMAGE. 023 */ 024package org.cleartk.token.tokenizer; 025 026import java.lang.reflect.Constructor; 027import java.lang.reflect.InvocationTargetException; 028import java.util.List; 029 030import org.apache.uima.UimaContext; 031import org.apache.uima.analysis_engine.AnalysisEngineDescription; 032import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 033import org.apache.uima.cas.FSIterator; 034import org.apache.uima.cas.Type; 035import org.apache.uima.jcas.JCas; 036import org.apache.uima.jcas.tcas.Annotation; 037import org.apache.uima.resource.ResourceInitializationException; 038import org.apache.uima.fit.component.JCasAnnotator_ImplBase; 039import org.apache.uima.fit.descriptor.ConfigurationParameter; 040import org.apache.uima.fit.factory.AnalysisEngineFactory; 041import org.apache.uima.fit.factory.initializable.InitializableFactory; 042import org.apache.uima.fit.util.JCasUtil; 043 044/** 045 * <br> 046 * Copyright (c) 2007-2008, Regents of the University of Colorado <br> 047 * All rights reserved. 048 * 049 * <p> 050 * 051 * @author Philip Ogren 052 * 053 */ 054public class TokenAnnotator extends JCasAnnotator_ImplBase { 055 056 public static AnalysisEngineDescription getDescription() throws ResourceInitializationException { 057 return AnalysisEngineFactory.createEngineDescription(TokenAnnotator.class); 058 } 059 060 public static final String PARAM_TOKENIZER_NAME = "tokenizerName"; 061 062 private static final String TOKENIZER_DESCRIPTION = "specifies the class type of the tokenizer that will be used by this annotator. " 063 + "If this parameter is not filled, then the default tokenenizer (org.cleartk.token.util.PennTreebankTokenizer) is used. " 064 + "A tokenenizer is defined as any implementation of the interface defined by org.cleartk.token.util.Tokenizer."; 065 066 @ConfigurationParameter( 067 name = PARAM_TOKENIZER_NAME, 068 description = TOKENIZER_DESCRIPTION, 069 defaultValue = "org.cleartk.token.tokenizer.PennTreebankTokenizer") 070 private String tokenizerName; 071 072 public static final String PARAM_TOKEN_TYPE_NAME = "tokenTypeName"; 073 074 @ConfigurationParameter( 075 name = PARAM_TOKEN_TYPE_NAME, 076 description = "class type of the tokens that are created by this annotator. If this parameter is not filled, then tokens of type org.cleartk.token.type.Token will be created.", 077 defaultValue = "org.cleartk.token.type.Token") 078 private String tokenTypeName; 079 080 public static final String PARAM_WINDOW_TYPE_NAME = "windowTypeName"; 081 082 private static final String WINDOW_TYPE_DESCRIPTION = "specifies the class type of annotations that will be tokenized. " 083 + "By default, the tokenizer will tokenize a document sentence by sentence. If you do not want to precede tokenization with" 084 + "sentence segmentation, then a reasonable value for this parameter is 'org.apache.uima.jcas.tcas.DocumentAnnotation'"; 085 086 @ConfigurationParameter( 087 name = PARAM_WINDOW_TYPE_NAME, 088 description = WINDOW_TYPE_DESCRIPTION, 089 defaultValue = "org.cleartk.token.type.Sentence") 090 private String windowTypeName; 091 092 Tokenizer tokenizer; 093 094 Class<? extends Annotation> tokenClass; 095 096 Constructor<? extends Annotation> tokenConstructor; 097 098 private Class<? extends Annotation> windowClass; 099 100 private Type windowType = null; 101 102 private boolean typesInitialized = false; 103 104 public void initialize(UimaContext uimaContext) throws ResourceInitializationException { 105 try { 106 super.initialize(uimaContext); 107 tokenizer = InitializableFactory.create(uimaContext, tokenizerName, Tokenizer.class); 108 tokenClass = InitializableFactory.getClass(tokenTypeName, Annotation.class); 109 tokenConstructor = tokenClass.getConstructor(new Class[] { 110 JCas.class, 111 Integer.TYPE, 112 Integer.TYPE }); 113 if (windowTypeName != null) 114 windowClass = InitializableFactory.getClass(windowTypeName, Annotation.class); 115 } catch (Exception e) { 116 throw new ResourceInitializationException(e); 117 } 118 } 119 120 private void initializeTypes(JCas jCas) { 121 if (windowClass != null) { 122 windowType = JCasUtil.getType(jCas, windowClass); 123 } 124 typesInitialized = true; 125 } 126 127 public void process(JCas jCas) throws AnalysisEngineProcessException { 128 try { 129 if (!typesInitialized) 130 initializeTypes(jCas); 131 if (windowType != null) { 132 FSIterator<Annotation> windows = jCas.getAnnotationIndex(windowType).iterator(); 133 while (windows.hasNext()) { 134 Annotation window = windows.next(); 135 List<Token> pojoTokens = tokenizer.getTokens(window.getCoveredText()); 136 createTokens(pojoTokens, window.getBegin(), jCas); 137 } 138 } else { 139 String text = jCas.getDocumentText(); 140 List<Token> pojoTokens = tokenizer.getTokens(text); 141 createTokens(pojoTokens, 0, jCas); 142 } 143 } catch (Exception e) { 144 throw new AnalysisEngineProcessException(e); 145 } 146 } 147 148 private void createTokens(List<Token> pojoTokens, int offset, JCas jCas) 149 throws InstantiationException, InvocationTargetException, IllegalAccessException { 150 for (Token pojoToken : pojoTokens) { 151 int tokenBegin = pojoToken.getBegin() + offset; 152 int tokenEnd = pojoToken.getEnd() + offset; 153 tokenConstructor.newInstance(jCas, tokenBegin, tokenEnd).addToIndexes(); 154 } 155 } 156 157 public void setTokenizerName(String tokenizerName) { 158 this.tokenizerName = tokenizerName; 159 } 160 161 public void setTokenTypeName(String tokenTypeName) { 162 this.tokenTypeName = tokenTypeName; 163 } 164 165 public void setWindowTypeName(String windowTypeName) { 166 this.windowTypeName = windowTypeName; 167 } 168 169}