001/** 
002 * Copyright (c) 2007-2008, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.token.tokenizer;
025
026import java.lang.reflect.Constructor;
027import java.lang.reflect.InvocationTargetException;
028import java.util.List;
029
030import org.apache.uima.UimaContext;
031import org.apache.uima.analysis_engine.AnalysisEngineDescription;
032import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
033import org.apache.uima.cas.FSIterator;
034import org.apache.uima.cas.Type;
035import org.apache.uima.jcas.JCas;
036import org.apache.uima.jcas.tcas.Annotation;
037import org.apache.uima.resource.ResourceInitializationException;
038import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
039import org.apache.uima.fit.descriptor.ConfigurationParameter;
040import org.apache.uima.fit.factory.AnalysisEngineFactory;
041import org.apache.uima.fit.factory.initializable.InitializableFactory;
042import org.apache.uima.fit.util.JCasUtil;
043
044/**
045 * <br>
046 * Copyright (c) 2007-2008, Regents of the University of Colorado <br>
047 * All rights reserved.
048 * 
049 * <p>
050 * 
051 * @author Philip Ogren
052 * 
053 */
054public class TokenAnnotator extends JCasAnnotator_ImplBase {
055
056  public static AnalysisEngineDescription getDescription() throws ResourceInitializationException {
057    return AnalysisEngineFactory.createEngineDescription(TokenAnnotator.class);
058  }
059
060  public static final String PARAM_TOKENIZER_NAME = "tokenizerName";
061
062  private static final String TOKENIZER_DESCRIPTION = "specifies the class type of the tokenizer that will be used by this annotator. "
063      + "If this parameter is not filled, then the default tokenenizer (org.cleartk.token.util.PennTreebankTokenizer) is used. "
064      + "A tokenenizer is defined as any implementation of the interface defined by org.cleartk.token.util.Tokenizer.";
065
066  @ConfigurationParameter(
067      name = PARAM_TOKENIZER_NAME,
068      description = TOKENIZER_DESCRIPTION,
069      defaultValue = "org.cleartk.token.tokenizer.PennTreebankTokenizer")
070  private String tokenizerName;
071
072  public static final String PARAM_TOKEN_TYPE_NAME = "tokenTypeName";
073
074  @ConfigurationParameter(
075      name = PARAM_TOKEN_TYPE_NAME,
076      description = "class type of the tokens that are created by this annotator. If this parameter is not filled, then tokens of type org.cleartk.token.type.Token will be created.",
077      defaultValue = "org.cleartk.token.type.Token")
078  private String tokenTypeName;
079
080  public static final String PARAM_WINDOW_TYPE_NAME = "windowTypeName";
081
082  private static final String WINDOW_TYPE_DESCRIPTION = "specifies the class type of annotations that will be tokenized. "
083      + "By default, the tokenizer will tokenize a document sentence by sentence.  If you do not want to precede tokenization with"
084      + "sentence segmentation, then a reasonable value for this parameter is 'org.apache.uima.jcas.tcas.DocumentAnnotation'";
085
086  @ConfigurationParameter(
087      name = PARAM_WINDOW_TYPE_NAME,
088      description = WINDOW_TYPE_DESCRIPTION,
089      defaultValue = "org.cleartk.token.type.Sentence")
090  private String windowTypeName;
091
092  Tokenizer tokenizer;
093
094  Class<? extends Annotation> tokenClass;
095
096  Constructor<? extends Annotation> tokenConstructor;
097
098  private Class<? extends Annotation> windowClass;
099
100  private Type windowType = null;
101
102  private boolean typesInitialized = false;
103
104  public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
105    try {
106      super.initialize(uimaContext);
107      tokenizer = InitializableFactory.create(uimaContext, tokenizerName, Tokenizer.class);
108      tokenClass = InitializableFactory.getClass(tokenTypeName, Annotation.class);
109      tokenConstructor = tokenClass.getConstructor(new Class[] {
110          JCas.class,
111          Integer.TYPE,
112          Integer.TYPE });
113      if (windowTypeName != null)
114        windowClass = InitializableFactory.getClass(windowTypeName, Annotation.class);
115    } catch (Exception e) {
116      throw new ResourceInitializationException(e);
117    }
118  }
119
120  private void initializeTypes(JCas jCas) {
121    if (windowClass != null) {
122      windowType = JCasUtil.getType(jCas, windowClass);
123    }
124    typesInitialized = true;
125  }
126
127  public void process(JCas jCas) throws AnalysisEngineProcessException {
128    try {
129      if (!typesInitialized)
130        initializeTypes(jCas);
131      if (windowType != null) {
132        FSIterator<Annotation> windows = jCas.getAnnotationIndex(windowType).iterator();
133        while (windows.hasNext()) {
134          Annotation window = windows.next();
135          List<Token> pojoTokens = tokenizer.getTokens(window.getCoveredText());
136          createTokens(pojoTokens, window.getBegin(), jCas);
137        }
138      } else {
139        String text = jCas.getDocumentText();
140        List<Token> pojoTokens = tokenizer.getTokens(text);
141        createTokens(pojoTokens, 0, jCas);
142      }
143    } catch (Exception e) {
144      throw new AnalysisEngineProcessException(e);
145    }
146  }
147
148  private void createTokens(List<Token> pojoTokens, int offset, JCas jCas)
149      throws InstantiationException, InvocationTargetException, IllegalAccessException {
150    for (Token pojoToken : pojoTokens) {
151      int tokenBegin = pojoToken.getBegin() + offset;
152      int tokenEnd = pojoToken.getEnd() + offset;
153      tokenConstructor.newInstance(jCas, tokenBegin, tokenEnd).addToIndexes();
154    }
155  }
156
157  public void setTokenizerName(String tokenizerName) {
158    this.tokenizerName = tokenizerName;
159  }
160
161  public void setTokenTypeName(String tokenTypeName) {
162    this.tokenTypeName = tokenTypeName;
163  }
164
165  public void setWindowTypeName(String windowTypeName) {
166    this.windowTypeName = windowTypeName;
167  }
168
169}