001/*
002 * Copyright (c) 2011, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.clearnlp;
025
026import java.net.URI;
027import java.util.List;
028
029import org.apache.uima.UimaContext;
030import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
031import org.apache.uima.jcas.JCas;
032import org.apache.uima.jcas.tcas.Annotation;
033import org.apache.uima.resource.ResourceInitializationException;
034import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
035import org.apache.uima.fit.descriptor.ConfigurationParameter;
036import org.apache.uima.fit.util.JCasUtil;
037
038import com.google.common.annotations.Beta;
039import com.clearnlp.nlp.NLPGetter;
040import com.clearnlp.reader.AbstractReader;
041import com.clearnlp.tokenization.AbstractTokenizer;
042
043/**
044 * <br>
045 * Copyright (c) 2012, Regents of the University of Colorado <br>
046 * All rights reserved.
047 * <p>
048 * This class provides a wrapper for the ClearNLP part of speech tokenizer for UIMA and/or ClearTK
049 * type systems.
050 * 
051 * Subclasses should override the abstract methods to produce the annotations relevant for the
052 * target type system.
053 * 
054 * This tagger is available here:
055 * <p>
056 * http://clearnlp.googlecode.com
057 * <p>
058 * 
059 * @author Lee Becker
060 * 
061 */
062@Beta
063public abstract class Tokenizer_ImplBase<TOKEN_TYPE extends Annotation> extends
064    JCasAnnotator_ImplBase {
065  public static final String DEFAULT_DICTIONARY_FILE_NAME = "dictionary-1.2.0.zip";
066
067  public static final String PARAM_LANGUAGE_CODE = "languageCode";
068
069  @ConfigurationParameter(
070      name = PARAM_LANGUAGE_CODE,
071      mandatory = false,
072      description = "Language code for the tokenizer (default value=en).",
073      defaultValue = AbstractReader.LANG_EN)
074  private String languageCode;
075
076  public static final String PARAM_DICTIONARY_URI = "dictionaryUri";
077
078  @ConfigurationParameter(
079      name = PARAM_DICTIONARY_URI,
080      mandatory = false,
081      description = "This parameter provides the URI of the tokenizer dictionary file.")
082  private URI dictionaryUri;
083
084  public static final String PARAM_WINDOW_CLASS = "windowClass";
085
086  private static final String WINDOW_TYPE_DESCRIPTION = "specifies the class type of annotations that will be tokenized. "
087      + "By default, the tokenizer will tokenize a document sentence by sentence.  If you do not want to precede tokenization with"
088      + "sentence segmentation, then a reasonable value for this parameter is 'org.apache.uima.jcas.tcas.DocumentAnnotation'";
089
090  @ConfigurationParameter(
091      name = PARAM_WINDOW_CLASS,
092      mandatory = false,
093      description = WINDOW_TYPE_DESCRIPTION,
094      defaultValue = "org.cleartk.token.type.Sentence")
095  private Class<? extends Annotation> windowClass;
096
097  private AbstractTokenizer tokenizer;
098
099  protected abstract TokenOps<TOKEN_TYPE> getTokenOps();
100
101  @Override
102  public void initialize(UimaContext context) throws ResourceInitializationException {
103    super.initialize(context);
104    try {
105      this.tokenizer = NLPGetter.getTokenizer(languageCode);
106    } catch (Exception e) {
107      throw new ResourceInitializationException(e);
108    }
109  }
110
111  @Override
112  public void process(JCas jCas) throws AnalysisEngineProcessException {
113    for (Annotation window : JCasUtil.select(jCas, this.windowClass)) {
114      String windowText = window.getCoveredText();
115      int windowOffset = window.getBegin();
116      List<String> tokens = tokenizer.getTokens(windowText);
117
118      int offset = 0;
119      for (String token : tokens) {
120        int tokenBegin = windowText.indexOf(token, offset);
121        int tokenEnd = tokenBegin + token.length();
122        try {
123          this.getTokenOps().createToken(jCas, windowOffset + tokenBegin, windowOffset + tokenEnd);
124        } catch (Exception e) {
125          throw new AnalysisEngineProcessException(e);
126        }
127        offset = tokenEnd;
128      }
129    }
130  }
131
132}