/*
 * Decompiled with CFR 0.152.
 */
package org.cleartk.clearnlp;

import com.google.common.annotations.Beta;
import com.google.common.collect.Lists;
import edu.emory.clir.clearnlp.component.utils.NLPUtils;
import edu.emory.clir.clearnlp.tokenization.AbstractTokenizer;
import edu.emory.clir.clearnlp.util.lang.TLanguage;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.cleartk.clearnlp.SentenceOps;
import org.cleartk.clearnlp.TokenOps;

/**
 * Base class for UIMA annotators that tokenize text with a ClearNLP tokenizer and can
 * optionally create sentence annotations in the same pass.
 *
 * <p>Subclasses supply the type-system specific factories through {@link #getTokenOps()} and
 * {@link #getSentenceOps()}. The text to process is either the whole document text or, when
 * {@code windowClass} is configured, the covered text of every annotation of that type.
 *
 * @param <TOKEN_TYPE> annotation type created for each token
 * @param <SENTENCE_TYPE> annotation type created for each sentence
 */
@Beta
public abstract class Tokenizer_ImplBase<TOKEN_TYPE extends Annotation, SENTENCE_TYPE extends Annotation>
extends JCasAnnotator_ImplBase {
    public static final String PARAM_SEGMENT_SENTENCES = "segmentSentences";
    /** When true, sentences are segmented (and annotated) together with the tokens. */
    @ConfigurationParameter(name=PARAM_SEGMENT_SENTENCES, mandatory=false, description="Turn on flag to include sentence segmentation", defaultValue={"false"})
    private Boolean segmentSentences;
    public static final String PARAM_LANGUAGE_CODE = "languageCode";
    // NOTE(review): the description advertises "en" while the declared default is "ENGLISH";
    // the string is left untouched because it is part of the published descriptor.
    @ConfigurationParameter(name=PARAM_LANGUAGE_CODE, mandatory=false, description="Language code for the tokenizer (default value=en).", defaultValue={"ENGLISH"})
    private String languageCode;
    public static final String PARAM_WINDOW_CLASS = "windowClass";
    private static final String WINDOW_TYPE_DESCRIPTION = "specifies the class type of annotations that will be tokenized. By default, the tokenizer will tokenize a document sentence by sentence.  If you do not want to precede tokenization withsentence segmentation, then a reasonable value for this parameter is 'org.apache.uima.jcas.tcas.DocumentAnnotation'";
    /** Annotation type whose instances delimit the text to process; {@code null} means the whole document. */
    @ConfigurationParameter(name=PARAM_WINDOW_CLASS, mandatory=false, description=WINDOW_TYPE_DESCRIPTION)
    private Class<? extends Annotation> windowClass;
    private AbstractTokenizer tokenizer;

    /** Returns the factory used to create token annotations. */
    protected abstract TokenOps<TOKEN_TYPE> getTokenOps();

    /** Returns the factory used to create sentence annotations. */
    protected abstract SentenceOps<SENTENCE_TYPE> getSentenceOps();

    /**
     * Initializes the annotator and obtains the tokenizer for the configured language code.
     *
     * @param context the UIMA context forwarded to the superclass
     * @throws ResourceInitializationException if the superclass fails to initialize or the
     *         tokenizer cannot be obtained
     */
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        try {
            this.tokenizer = NLPUtils.getTokenizer(TLanguage.getType(this.languageCode));
        }
        catch (Exception e) {
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Tokenizes the document text, or the covered text of each configured window annotation.
     *
     * @param jCas the CAS to read text from and add annotations to
     * @throws AnalysisEngineProcessException if a token cannot be created or aligned
     */
    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
        if (this.windowClass == null) {
            this.processText(jCas, jCas.getDocumentText(), 0);
            return;
        }
        for (Annotation window : JCasUtil.select(jCas, this.windowClass)) {
            this.processText(jCas, window.getCoveredText(), window.getBegin());
        }
    }

    /** Routes one span of text to the sentence-segmenting or the tokenize-only path. */
    private void processText(JCas jCas, String text, int textOffset) throws AnalysisEngineProcessException {
        if (Boolean.TRUE.equals(this.segmentSentences)) {
            this.segmentSentencesAndTokenizeText(jCas, text, textOffset);
        } else {
            this.tokenizeText(jCas, text, textOffset);
        }
    }

    /**
     * Segments {@code text} into sentences and tokens and annotates both.
     *
     * <p>A sentence spans from the begin of its first token to the end of its last token;
     * sentences without tokens produce no annotation.
     *
     * @param jCas the CAS to add annotations to
     * @param text the text to process; {@code null} or empty text yields no annotations
     * @param textOffset offset of {@code text} within the document, added to every position
     * @throws AnalysisEngineProcessException if a token cannot be created or aligned
     */
    void segmentSentencesAndTokenizeText(JCas jCas, String text, int textOffset) throws AnalysisEngineProcessException {
        if (text == null || text.isEmpty()) {
            return;
        }
        // NOTE(review): IOUtils.toInputStream(String) encodes with the platform default charset
        // per Commons IO; kept as-is because the tokenizer's decoding charset is not visible here.
        InputStream stream = IOUtils.toInputStream(text);
        List<List<String>> sentences = this.tokenizer.segmentize(stream);
        // The cursor is relative to `text`; textOffset is only applied when creating annotations.
        int cursor = 0;
        for (List<String> sentenceTokens : sentences) {
            int sentenceBegin = -1;
            int sentenceEnd = -1;
            for (String token : sentenceTokens) {
                int tokenBegin = Tokenizer_ImplBase.locateToken(text, token, cursor);
                int tokenEnd = tokenBegin + token.length();
                this.addToken(jCas, textOffset + tokenBegin, textOffset + tokenEnd);
                if (sentenceBegin < 0) {
                    sentenceBegin = tokenBegin;
                }
                sentenceEnd = tokenEnd;
                cursor = tokenEnd;
            }
            if (sentenceBegin < 0) {
                continue;
            }
            this.getSentenceOps().createSentence(jCas, textOffset + sentenceBegin, textOffset + sentenceEnd);
        }
    }

    /**
     * Tokenizes {@code text} and creates one token annotation per token.
     *
     * @param jCas the CAS to add annotations to
     * @param text the text to tokenize; {@code null} or empty text yields no annotations
     * @param textOffset offset of {@code text} within the document, added to every position
     * @throws AnalysisEngineProcessException if a token cannot be created or aligned
     */
    void tokenizeText(JCas jCas, String text, int textOffset) throws AnalysisEngineProcessException {
        if (text == null || text.isEmpty()) {
            return;
        }
        List<String> tokens = this.tokenizer.tokenize(text);
        int cursor = 0;
        for (String token : tokens) {
            int tokenBegin = Tokenizer_ImplBase.locateToken(text, token, cursor);
            int tokenEnd = tokenBegin + token.length();
            this.addToken(jCas, textOffset + tokenBegin, textOffset + tokenEnd);
            cursor = tokenEnd;
        }
    }

    /** Creates a token annotation, wrapping any failure of the token factory. */
    private void addToken(JCas jCas, int begin, int end) throws AnalysisEngineProcessException {
        try {
            this.getTokenOps().createToken(jCas, begin, end);
        }
        catch (Exception e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Finds {@code token} in {@code text} at or after {@code fromIndex}.
     *
     * <p>Failing fast avoids creating an annotation with a negative begin and moving the search
     * cursor backwards, which would misalign every following token.
     */
    private static int locateToken(String text, String token, int fromIndex) throws AnalysisEngineProcessException {
        int begin = text.indexOf(token, fromIndex);
        if (begin < 0) {
            throw new AnalysisEngineProcessException(new IllegalStateException(
                "Cannot align token '" + token + "' with the source text at or after offset " + fromIndex));
        }
        return begin;
    }
}

