/*
 * Decompiled with CFR 0.152.
 */
package org.cleartk.clearnlp;

import com.clearnlp.nlp.NLPGetter;
import com.clearnlp.tokenization.AbstractTokenizer;
import com.google.common.annotations.Beta;
import java.net.URI;
import java.util.List;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.cleartk.clearnlp.TokenOps;

@Beta
public abstract class Tokenizer_ImplBase<TOKEN_TYPE extends Annotation>
extends JCasAnnotator_ImplBase {
    public static final String DEFAULT_DICTIONARY_FILE_NAME = "dictionary-1.2.0.zip";
    public static final String PARAM_LANGUAGE_CODE = "languageCode";
    @ConfigurationParameter(name="languageCode", mandatory=false, description="Language code for the tokenizer (default value=en).", defaultValue={"en"})
    private String languageCode;
    public static final String PARAM_DICTIONARY_URI = "dictionaryUri";
    @ConfigurationParameter(name="dictionaryUri", mandatory=false, description="This parameter provides the URI of the tokenizer dictionary file.")
    private URI dictionaryUri;
    public static final String PARAM_WINDOW_CLASS = "windowClass";
    private static final String WINDOW_TYPE_DESCRIPTION = "specifies the class type of annotations that will be tokenized. By default, the tokenizer will tokenize a document sentence by sentence.  If you do not want to precede tokenization withsentence segmentation, then a reasonable value for this parameter is 'org.apache.uima.jcas.tcas.DocumentAnnotation'";
    @ConfigurationParameter(name="windowClass", mandatory=false, description="specifies the class type of annotations that will be tokenized. By default, the tokenizer will tokenize a document sentence by sentence.  If you do not want to precede tokenization withsentence segmentation, then a reasonable value for this parameter is 'org.apache.uima.jcas.tcas.DocumentAnnotation'", defaultValue={"org.cleartk.token.type.Sentence"})
    private Class<? extends Annotation> windowClass;
    private AbstractTokenizer tokenizer;

    protected abstract TokenOps<TOKEN_TYPE> getTokenOps();

    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        try {
            this.tokenizer = NLPGetter.getTokenizer((String)this.languageCode);
        }
        catch (Exception e) {
            throw new ResourceInitializationException((Throwable)e);
        }
    }

    public void process(JCas jCas) throws AnalysisEngineProcessException {
        for (Annotation window : JCasUtil.select((JCas)jCas, this.windowClass)) {
            String windowText = window.getCoveredText();
            int windowOffset = window.getBegin();
            List tokens = this.tokenizer.getTokens(windowText);
            int offset = 0;
            for (String token : tokens) {
                int tokenBegin = windowText.indexOf(token, offset);
                int tokenEnd = tokenBegin + token.length();
                try {
                    this.getTokenOps().createToken(jCas, windowOffset + tokenBegin, windowOffset + tokenEnd);
                }
                catch (Exception e) {
                    throw new AnalysisEngineProcessException((Throwable)e);
                }
                offset = tokenEnd;
            }
        }
    }
}

