/*
 * Decompiled with CFR 0.152.
 */
package chalk.uima.tokenize;

import chalk.tools.tokenize.TokenSample;
import chalk.tools.tokenize.TokenSampleStream;
import chalk.tools.tokenize.TokenizerME;
import chalk.tools.tokenize.TokenizerModel;
import chalk.tools.util.ObjectStream;
import chalk.tools.util.ObjectStreamUtils;
import chalk.tools.util.PlainTextByLineStream;
import chalk.tools.util.Span;
import chalk.uima.util.CasConsumerUtil;
import chalk.uima.util.ContainingConstraint;
import chalk.uima.util.OpennlpUtil;
import chalk.uima.util.SampleTraceStream;
import chalk.uima.util.UimaUtil;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import nak.maxent.GIS;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.FSMatchConstraint;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.collection.CasConsumer_ImplBase;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceProcessException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
import org.apache.uima.util.ProcessTrace;

/**
 * UIMA CAS consumer that gathers {@link TokenSample}s from the CASes it is
 * given and, once the collection has been processed, trains a tokenizer model
 * from them and serializes it to a file below the resource manager's data path.
 *
 * <p>For every sentence annotation in a CAS one sample is recorded, consisting
 * of the sentence's covered text and the sentence-relative spans of the token
 * annotations selected for that sentence.
 *
 * <p>The consumer accumulates samples in memory between calls, which is why
 * {@link #isStateless()} returns {@code false}.
 */
public final class TokenizerTrainer
extends CasConsumer_ImplBase {
    /** Name of the optional boolean parameter forwarded to {@code TokenizerME.train}. */
    public static final String IS_ALPHA_NUMERIC_OPTIMIZATION = "opennlp.uima.tokenizer.IsAlphaNumericOptimization";

    /** Samples collected so far; set to {@code null} after training and on destroy. */
    private List<TokenSample> tokenSamples = new ArrayList<TokenSample>();
    private UimaContext mContext;
    private Type mSentenceType;
    private Type mTokenType;
    /** Model file name, resolved relative to the resource manager's data path. */
    private String mModelName;
    /** Optional path of a file with extra training samples; {@code null} if not configured. */
    private String additionalTrainingDataFile;
    /** Character encoding of {@link #additionalTrainingDataFile}; only read when that file is set. */
    private String additionalTrainingDataEncoding;
    private String language;
    /** Value of {@link #IS_ALPHA_NUMERIC_OPTIMIZATION}; defaults to {@code false} when absent. */
    private Boolean isSkipAlphaNumerics;
    private Logger mLogger;
    /** Character encoding used when writing {@link #sampleTraceFile}. */
    private String sampleTraceFileEncoding;
    /** Optional file the training samples are traced to; {@code null} if not configured. */
    private File sampleTraceFile;

    /**
     * Reads the consumer's configuration parameters from the UIMA context.
     *
     * <p>The model name and the language are required. The alpha-numeric
     * optimization flag, the additional training data file and the sample trace
     * file are optional; when one of the two files is configured, its encoding
     * parameter becomes required as well.
     *
     * @throws ResourceInitializationException declared by the overridden method;
     *         presumably also raised by the parameter helpers when a required
     *         parameter is missing (confirm against {@code CasConsumerUtil})
     */
    public void initialize() throws ResourceInitializationException {
        super.initialize();
        this.mContext = this.getUimaContext();
        this.mLogger = this.mContext.getLogger();
        if (this.mLogger.isLoggable(Level.INFO)) {
            this.mLogger.log(Level.INFO, "Initializing the OpenNLP Tokenizer trainer.");
        }

        this.mModelName = CasConsumerUtil.getRequiredStringParameter(this.mContext, UimaUtil.MODEL_PARAMETER);
        this.language = CasConsumerUtil.getRequiredStringParameter(this.mContext, "opennlp.uima.Language");

        this.isSkipAlphaNumerics = CasConsumerUtil.getOptionalBooleanParameter(this.mContext, IS_ALPHA_NUMERIC_OPTIMIZATION);
        if (this.isSkipAlphaNumerics == null) {
            this.isSkipAlphaNumerics = false;
        }

        this.additionalTrainingDataFile = CasConsumerUtil.getOptionalStringParameter(this.mContext, "opennlp.uima.AdditionalTrainingDataFile");
        if (this.additionalTrainingDataFile != null) {
            // The encoding is only mandatory when an additional data file is configured.
            this.additionalTrainingDataEncoding = CasConsumerUtil.getRequiredStringParameter(this.mContext, "opennlp.uima.AdditionalTrainingDataEncoding");
        }

        String sampleTraceFileName = CasConsumerUtil.getOptionalStringParameter(this.mContext, "opennlp.uima.SampleTraceFile");
        if (sampleTraceFileName != null) {
            // The trace file is located below the resource manager's data path.
            this.sampleTraceFile = new File(this.getUimaContextAdmin().getResourceManager().getDataPath() + File.separatorChar + sampleTraceFileName);
            this.sampleTraceFileEncoding = CasConsumerUtil.getRequiredStringParameter(this.mContext, "opennlp.uima.SampleTraceFileEncoding");
        }
    }

    /**
     * Resolves the configured sentence and token type names against the given
     * type system.
     *
     * @param typeSystem the type system in which the type names are looked up
     * @throws ResourceInitializationException declared by the overridden method;
     *         presumably raised by the helpers when a parameter or type is
     *         missing (confirm against {@code CasConsumerUtil})
     */
    public void typeSystemInit(TypeSystem typeSystem) throws ResourceInitializationException {
        String sentenceTypeName = CasConsumerUtil.getRequiredStringParameter(this.mContext, UimaUtil.SENTENCE_TYPE_PARAMETER);
        this.mSentenceType = CasConsumerUtil.getType(typeSystem, sentenceTypeName);

        String tokenTypeName = CasConsumerUtil.getRequiredStringParameter(this.mContext, "opennlp.uima.TokenType");
        this.mTokenType = CasConsumerUtil.getType(typeSystem, tokenTypeName);
    }

    /**
     * Records one training sample for every sentence annotation in the CAS.
     *
     * @param cAS the CAS whose sentence and token annotations are read
     */
    public void processCas(CAS cAS) {
        AnnotationIndex<AnnotationFS> sentences = cAS.getAnnotationIndex(this.mSentenceType);
        for (AnnotationFS sentence : sentences) {
            this.process(cAS, sentence);
        }
    }

    /**
     * Builds a {@link TokenSample} for a single sentence and appends it to the
     * collected samples.
     *
     * @param cAS the CAS that holds the token annotations
     * @param annotationFS the sentence annotation to create the sample for
     */
    private void process(CAS cAS, AnnotationFS annotationFS) {
        AnnotationIndex<AnnotationFS> allTokens = cAS.getAnnotationIndex(this.mTokenType);
        // Restrict the token index to annotations accepted by the constraint built from this sentence.
        ContainingConstraint containingConstraint = new ContainingConstraint(annotationFS);
        FSIterator<AnnotationFS> containedTokens = cAS.createFilteredIterator(allTokens.iterator(), (FSMatchConstraint)containingConstraint);

        List<Span> tokenSpans = new LinkedList<Span>();
        int sentenceBegin = annotationFS.getBegin();
        while (containedTokens.hasNext()) {
            AnnotationFS token = containedTokens.next();
            // Token offsets are shifted so that they are relative to the sentence start,
            // matching the covered text stored in the sample.
            tokenSpans.add(new Span(token.getBegin() - sentenceBegin, token.getEnd() - sentenceBegin));
        }

        Span[] spans = tokenSpans.toArray(new Span[tokenSpans.size()]);
        Arrays.sort(spans);
        this.tokenSamples.add(new TokenSample(annotationFS.getCoveredText(), spans));
    }

    /**
     * Trains the tokenizer model from all collected samples and serializes it to
     * the configured model file below the resource manager's data path.
     *
     * <p>If an additional training data file is configured, its samples are
     * appended to the collected ones. If a sample trace file is configured, the
     * sample stream is wrapped in a {@link SampleTraceStream} writing to that
     * file. Both the additional data input and the trace output are closed
     * before this method returns, whether or not training succeeds.
     *
     * <p>The collected samples are released afterwards, so
     * {@link #processCas(CAS)} must not be called on this instance again.
     *
     * @param processTrace not used by this implementation
     * @throws ResourceProcessException declared by the overridden method
     * @throws IOException if reading the additional training data, writing the
     *         trace file, or closing either of them fails
     */
    public void collectionProcessComplete(ProcessTrace processTrace) throws ResourceProcessException, IOException {
        if (this.mLogger.isLoggable(Level.INFO)) {
            this.mLogger.log(Level.INFO, "Collected " + this.tokenSamples.size() + " token samples.");
        }
        GIS.PRINT_MESSAGES = false;

        ObjectStream<TokenSample> samples = ObjectStreamUtils.createObjectStream(this.tokenSamples);
        InputStream additionalDataIn = null;
        OutputStream traceOut = null;
        OutputStreamWriter traceWriter = null;
        TokenizerModel tokenizerModel;
        try {
            if (this.additionalTrainingDataFile != null) {
                if (this.mLogger.isLoggable(Level.INFO)) {
                    this.mLogger.log(Level.INFO, "Using addional training data file: " + this.additionalTrainingDataFile);
                }
                additionalDataIn = new FileInputStream(this.additionalTrainingDataFile);
                ObjectStream<TokenSample> additionalSamples = new TokenSampleStream(new PlainTextByLineStream(new InputStreamReader(additionalDataIn, this.additionalTrainingDataEncoding)));
                samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples);
            }
            if (this.sampleTraceFile != null) {
                // Keep a handle on the raw stream so it can still be closed if the
                // writer cannot be created (e.g. unsupported encoding).
                traceOut = new FileOutputStream(this.sampleTraceFile);
                traceWriter = new OutputStreamWriter(traceOut, this.sampleTraceFileEncoding);
                samples = new SampleTraceStream<TokenSample>(samples, traceWriter);
            }
            tokenizerModel = TokenizerME.train(this.language, samples, this.isSkipAlphaNumerics);
        }
        finally {
            try {
                // Closing the writer flushes buffered trace output and closes the
                // underlying file stream; previously the trace file was never closed.
                if (traceWriter != null) {
                    traceWriter.close();
                } else if (traceOut != null) {
                    traceOut.close();
                }
            }
            finally {
                if (additionalDataIn != null) {
                    additionalDataIn.close();
                }
            }
        }

        // Drop the reference so the collected samples can be garbage collected.
        this.tokenSamples = null;

        File modelFile = new File(this.getUimaContextAdmin().getResourceManager().getDataPath() + File.separatorChar + this.mModelName);
        OpennlpUtil.serialize(tokenizerModel, modelFile);
    }

    /**
     * Reports that this consumer keeps state between calls (the collected samples).
     *
     * @return always {@code false}
     */
    public boolean isStateless() {
        return false;
    }

    /**
     * Releases the collected samples.
     */
    public void destroy() {
        this.tokenSamples = null;
    }
}

