/*
 * Decompiled with CFR 0.152.
 */
package chalk.tools.cmdline.tokenizer;

import chalk.tools.cmdline.AbstractTrainerTool;
import chalk.tools.cmdline.CmdLineUtil;
import chalk.tools.cmdline.TerminateToolException;
import chalk.tools.cmdline.params.TrainingToolParams;
import chalk.tools.cmdline.tokenizer.TrainingParams;
import chalk.tools.dictionary.Dictionary;
import chalk.tools.tokenize.TokenSample;
import chalk.tools.tokenize.TokenizerFactory;
import chalk.tools.tokenize.TokenizerME;
import chalk.tools.tokenize.TokenizerModel;
import chalk.tools.util.model.ModelUtil;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import nak.core.TrainUtil;

/**
 * Command line tool that trains a {@link TokenizerModel} from a stream of
 * {@link TokenSample}s and writes the resulting model to the configured
 * output file.
 */
public final class TokenizerTrainerTool
extends AbstractTrainerTool<TokenSample, TokenizerTrainerTool.TrainerToolParams> {
    /**
     * Creates the tool, registering {@link TokenSample} as the sample type and
     * {@link TrainerToolParams} as the parameter interface with the superclass.
     */
    public TokenizerTrainerTool() {
        super(TokenSample.class, TrainerToolParams.class);
    }

    /**
     * @return a one-line description of this tool
     */
    @Override
    public String getShortDescription() {
        return "trainer for the learnable tokenizer";
    }

    /**
     * Loads the abbreviation dictionary from the given file.
     *
     * @param file the dictionary file, or {@code null} if no dictionary is used
     * @return the loaded dictionary, or {@code null} when {@code file} is {@code null}
     * @throws IOException if the file cannot be opened, read, or closed
     */
    static Dictionary loadDict(File file) throws IOException {
        if (file == null) {
            return null;
        }
        CmdLineUtil.checkInputFile("abb dict", file);
        FileInputStream dictStream = new FileInputStream(file);
        try {
            return new Dictionary(dictStream);
        }
        finally {
            // Release the file handle whether or not parsing succeeded;
            // previously the stream was never closed.
            dictStream.close();
        }
    }

    /**
     * Runs the training: resolves the training parameters, loads the optional
     * abbreviation dictionary, trains the tokenizer model on the sample stream
     * and writes the model to the output file.
     *
     * @param string forwarded unchanged to {@code super.run}
     * @param stringArray forwarded unchanged to {@code super.run}
     * @throws TerminateToolException if the training parameters file is invalid,
     *         requests sequence training, or an I/O error occurs during training
     */
    @Override
    public void run(String string, String[] stringArray) {
        super.run(string, stringArray);
        // Read this.params only after super.run has executed, as the original code did.
        TrainerToolParams toolParams = (TrainerToolParams)this.params;

        this.mlParams = CmdLineUtil.loadTrainingParameters(toolParams.getParams(), false);
        if (this.mlParams == null) {
            // No parameters were loaded: build them from the iterations and cutoff options.
            this.mlParams = ModelUtil.createTrainingParameters(toolParams.getIterations(), toolParams.getCutoff());
        } else {
            if (!TrainUtil.isValid(this.mlParams.getSettings())) {
                throw new TerminateToolException(1, "Training parameters file '" + toolParams.getParams() + "' is invalid!");
            }
            if (TrainUtil.isSequenceTraining(this.mlParams.getSettings())) {
                throw new TerminateToolException(1, "Sequence training is not supported!");
            }
        }

        File modelFile = toolParams.getModel();
        // Check the output location before starting the training.
        CmdLineUtil.checkOutputFile("tokenizer model", modelFile);

        TokenizerModel tokenizerModel;
        try {
            Dictionary abbreviations = TokenizerTrainerTool.loadDict(toolParams.getAbbDict());
            TokenizerFactory tokenizerFactory = TokenizerFactory.create(toolParams.getFactory(), this.factory.getLang(), abbreviations, toolParams.getAlphaNumOpt(), null);
            tokenizerModel = TokenizerME.train(this.sampleStream, tokenizerFactory, this.mlParams);
        }
        catch (IOException iOException) {
            throw new TerminateToolException(-1, "IO error while reading training data or indexing data: " + iOException.getMessage(), iOException);
        }
        finally {
            try {
                this.sampleStream.close();
            }
            catch (IOException ignored) {
                // Deliberately best-effort: a failure to close the sample stream
                // must not replace the training result or the primary error.
            }
        }
        CmdLineUtil.writeModel("tokenizer", modelFile, tokenizerModel);
    }

    /**
     * Parameter interface of this tool: the union of {@link TrainingParams}
     * and {@link TrainingToolParams}.
     */
    static interface TrainerToolParams
    extends TrainingParams,
    TrainingToolParams {
    }
}

