001/*
002 * Copyright (c) 2012, Regents of the University of Colorado 
003 * All rights reserved.
004 * 
005 * Redistribution and use in source and binary forms, with or without
006 * modification, are permitted provided that the following conditions are met:
007 * 
008 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
009 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
010 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
011 * 
012 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
013 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
014 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
015 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
016 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
017 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
018 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
019 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
020 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
021 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
022 * POSSIBILITY OF SUCH DAMAGE. 
023 */
024package org.cleartk.clearnlp;
025
026import java.util.List;
027
028import org.apache.uima.UimaContext;
029import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
030import org.apache.uima.jcas.JCas;
031import org.apache.uima.jcas.tcas.Annotation;
032import org.apache.uima.resource.ResourceInitializationException;
033import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
034import org.apache.uima.fit.descriptor.ConfigurationParameter;
035import org.apache.uima.fit.util.JCasUtil;
036
037import com.google.common.annotations.Beta;
038import com.clearnlp.component.AbstractComponent;
039import com.clearnlp.dependency.DEPNode;
040import com.clearnlp.dependency.DEPTree;
041import com.clearnlp.nlp.NLPGetter;
042import com.clearnlp.nlp.NLPLib;
043import com.clearnlp.reader.AbstractReader;
044
045/**
046 * <br>
047 * Copyright (c) 2012, Regents of the University of Colorado <br>
048 * All rights reserved.
049 * <p>
050 * This class provides a UIMA/ClearTK wrapper for the ClearNLP part of speech (POS) tagger. This
051 * engine requires tokenize input and produces POS tags on the tokens.
052 * 
053 * Subclasses should override the abstract methods to produce the annotations relevant for the
054 * target type system.
055 * 
056 * This tagger is available here:
057 * <p>
058 * http://clearnlp.googlecode.com
059 * <p>
060 * 
061 * @author Lee Becker
062 * 
063 */
064@Beta
065public abstract class PosTagger_ImplBase<TOKEN_TYPE extends Annotation> extends
066    JCasAnnotator_ImplBase {
067
068  public static final String DEFAULT_MODEL_PATH = "general-en";
069  
070  public static final String PARAM_MODEL_PATH = "modelPath";
071  
072  @ConfigurationParameter(
073      name = PARAM_MODEL_PATH,
074      mandatory = false,
075      description = "This parameter provides the path to the pos tagger model.",
076      defaultValue=DEFAULT_MODEL_PATH)
077  private String modelPath;
078
079  
080  /*
081  public static final String PARAM_MODEL_URI = "modelUri";
082
083  @ConfigurationParameter(
084      name = PARAM_MODEL_URI,
085      mandatory = false,
086      description = "This parameter provides the URI to the pos tagger model.")
087  private URI modelUri;
088  */
089
090  public static final String PARAM_LANGUAGE_CODE = "languageCode";
091
092  @ConfigurationParameter(
093      name = PARAM_LANGUAGE_CODE,
094      mandatory = false,
095      description = "Language code for the pos tagger (default value=en).",
096      defaultValue = AbstractReader.LANG_EN)
097  private String languageCode;
098
099  public static final String PARAM_WINDOW_CLASS = "windowClass";
100
101  private static final String WINDOW_TYPE_DESCRIPTION = "specifies the class type of annotations that will be tokenized. "
102      + "By default, the tokenizer will tokenize a document sentence by sentence.  If you do not want to precede tokenization with"
103      + "sentence segmentation, then a reasonable value for this parameter is 'org.apache.uima.jcas.tcas.DocumentAnnotation'";
104
105  @ConfigurationParameter(
106      name = PARAM_WINDOW_CLASS,
107      mandatory = false,
108      description = WINDOW_TYPE_DESCRIPTION,
109      defaultValue = "org.cleartk.token.type.Sentence")
110  private Class<? extends Annotation> windowClass;
111
112  private TokenOps<TOKEN_TYPE> tokenOps;
113
114  public PosTagger_ImplBase(TokenOps<TOKEN_TYPE> tokenOps) {
115    this.tokenOps = tokenOps;
116  }
117
118  @Override
119  public void initialize(UimaContext context) throws ResourceInitializationException {
120    super.initialize(context);
121    try {
122      // Load POS tagger model
123      this.tagger = NLPGetter.getComponent(modelPath, languageCode, NLPLib.MODE_POS);
124
125    } catch (Exception e) {
126      throw new ResourceInitializationException(e);
127    }
128
129  }
130
131  @Override
132  public void process(JCas jCas) throws AnalysisEngineProcessException {
133
134    for (Annotation window : JCasUtil.select(jCas, this.windowClass)) {
135      List<TOKEN_TYPE> tokens = this.tokenOps.selectTokens(jCas, window);
136      if (tokens.size() <= 0) {
137        return;
138      }
139
140      List<String> tokenStrings = JCasUtil.toText(tokens);
141
142      // As of version 1.3.0, ClearNLP does all processing to go through its own dependency tree
143      // structure
144      DEPTree clearNlpDepTree = NLPGetter.toDEPTree(tokenStrings);
145      this.tagger.process(clearNlpDepTree);
146
147      // Note the ClearNLP counts index 0 as the sentence dependency node, so the POS tag indices
148      // are shifted by one from the token indices
149      for (int i = 0; i < tokens.size(); i++) {
150        TOKEN_TYPE token = tokens.get(i);
151        DEPNode node = clearNlpDepTree.get(i+1);
152        this.tokenOps.setPos(jCas, token, node.pos);
153        this.tokenOps.setLemma(jCas, token, node.lemma);
154      }
155    }
156  }
157
158  private AbstractComponent tagger;
159
160}