/*
 * Copyright (c) 2011, Regents of the University of Colorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package org.cleartk.clearnlp;

import java.net.URI;
import java.util.List;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;

import com.google.common.annotations.Beta;
import com.clearnlp.nlp.NLPGetter;
import com.clearnlp.reader.AbstractReader;
import com.clearnlp.tokenization.AbstractTokenizer;

/**
 * <br>
 * Copyright (c) 2012, Regents of the University of Colorado <br>
 * All rights reserved.
 * <p>
 * This class provides a wrapper for the ClearNLP tokenizer for UIMA and/or ClearTK type systems.
 * <p>
 * For every annotation of the configured window class, the covered text is handed to the ClearNLP
 * tokenizer, each returned token string is located in the window text, and a token annotation is
 * created over the matching document span through {@link #getTokenOps()}.
 * <p>
 * Subclasses should override the abstract methods to produce the annotations relevant for the
 * target type system.
 * <p>
 * This tokenizer is available here:
 * <p>
 * http://clearnlp.googlecode.com
 * <p>
 *
 * @param <TOKEN_TYPE>
 *          the annotation type created for each token
 *
 * @author Lee Becker
 *
 */
@Beta
public abstract class Tokenizer_ImplBase<TOKEN_TYPE extends Annotation> extends
    JCasAnnotator_ImplBase {

  /** File name of the default tokenizer dictionary. This class itself does not reference it. */
  public static final String DEFAULT_DICTIONARY_FILE_NAME = "dictionary-1.2.0.zip";

  /** Name of the configuration parameter holding the tokenizer language code. */
  public static final String PARAM_LANGUAGE_CODE = "languageCode";

  @ConfigurationParameter(
      name = PARAM_LANGUAGE_CODE,
      mandatory = false,
      description = "Language code for the tokenizer (default value=en).",
      defaultValue = AbstractReader.LANG_EN)
  private String languageCode;

  /** Name of the configuration parameter holding the URI of the tokenizer dictionary file. */
  public static final String PARAM_DICTIONARY_URI = "dictionaryUri";

  // NOTE(review): this field is declared as a parameter but is never read in this class;
  // initialize(UimaContext) obtains the tokenizer from the language code alone. Confirm whether
  // the dictionary is still meant to be configurable.
  @ConfigurationParameter(
      name = PARAM_DICTIONARY_URI,
      mandatory = false,
      description = "This parameter provides the URI of the tokenizer dictionary file.")
  private URI dictionaryUri;

  /** Name of the configuration parameter selecting the annotation type to tokenize within. */
  public static final String PARAM_WINDOW_CLASS = "windowClass";

  // Each fragment ends with a space so that the concatenated description reads as normal prose.
  private static final String WINDOW_TYPE_DESCRIPTION = "specifies the class type of annotations that will be tokenized. "
      + "By default, the tokenizer will tokenize a document sentence by sentence. If you do not want to precede tokenization with "
      + "sentence segmentation, then a reasonable value for this parameter is 'org.apache.uima.jcas.tcas.DocumentAnnotation'";

  @ConfigurationParameter(
      name = PARAM_WINDOW_CLASS,
      mandatory = false,
      description = WINDOW_TYPE_DESCRIPTION,
      defaultValue = "org.cleartk.token.type.Sentence")
  private Class<? extends Annotation> windowClass;

  /** ClearNLP tokenizer, created in {@link #initialize(UimaContext)}. */
  private AbstractTokenizer tokenizer;

  /**
   * Supplies the operations used to create token annotations in the target type system.
   *
   * @return the token operations; {@link #process(JCas)} asks for them once per token
   */
  protected abstract TokenOps<TOKEN_TYPE> getTokenOps();

  /**
   * Initializes the annotator and obtains the ClearNLP tokenizer for the configured language code.
   *
   * @param context
   *          the UIMA context passed on to the superclass
   * @throws ResourceInitializationException
   *           if superclass initialization fails or the tokenizer cannot be obtained; in the
   *           latter case the original exception is kept as the cause
   */
  @Override
  public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);
    try {
      this.tokenizer = NLPGetter.getTokenizer(languageCode);
    } catch (Exception e) {
      throw new ResourceInitializationException(e);
    }
  }

  /**
   * Tokenizes the text covered by every window annotation in the CAS and creates one token
   * annotation per token returned by the tokenizer.
   * <p>
   * The tokenizer returns plain strings, so each token is located by searching the window text
   * forward from the end of the previous token. Window-relative positions are then shifted by the
   * window's begin offset to obtain document offsets.
   *
   * @param jCas
   *          the CAS whose window annotations are tokenized
   * @throws AnalysisEngineProcessException
   *           if a token returned by the tokenizer cannot be found in the remaining window text,
   *           or if creating a token annotation fails
   */
  @Override
  public void process(JCas jCas) throws AnalysisEngineProcessException {
    for (Annotation window : JCasUtil.select(jCas, this.windowClass)) {
      String windowText = window.getCoveredText();
      int windowOffset = window.getBegin();
      List<String> tokens = this.tokenizer.getTokens(windowText);

      // Position in windowText just past the previously aligned token.
      int offset = 0;
      for (String token : tokens) {
        int tokenBegin = windowText.indexOf(token, offset);
        if (tokenBegin < 0) {
          // Without this check a token that is not a verbatim substring of the window would be
          // annotated starting at windowOffset - 1 and would misalign every following token.
          throw new AnalysisEngineProcessException(new IllegalStateException("Token '" + token
              + "' returned by the tokenizer was not found in the window text at or after offset "
              + offset + " (window begins at document offset " + windowOffset + ")"));
        }
        int tokenEnd = tokenBegin + token.length();
        try {
          this.getTokenOps().createToken(jCas, windowOffset + tokenBegin, windowOffset + tokenEnd);
        } catch (Exception e) {
          throw new AnalysisEngineProcessException(e);
        }
        offset = tokenEnd;
      }
    }
  }

}