/*
 * Decompiled with CFR 0.152.
 */
package org.maochen.nlp.app.chunker;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.maochen.nlp.app.chunker.CRFChunker;
import org.maochen.nlp.app.chunker.ChunkerFeatureExtractor;
import org.maochen.nlp.ml.SequenceTuple;
import org.maochen.nlp.ml.Tuple;
import org.maochen.nlp.ml.classifier.maxent.MaxEntClassifier;
import org.maochen.nlp.ml.util.TrainingDataUtils;
import org.maochen.nlp.ml.vector.IVector;
import org.maochen.nlp.ml.vector.LabeledVector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Chunker that trains and applies the inherited {@link MaxEntClassifier} using the features
 * produced by {@link ChunkerFeatureExtractor}.
 *
 * <p>Training flattens every sequence of the training file into individual tuples. Prediction
 * labels each token on its own by picking the label with the highest value in the map returned
 * by the inherited {@code predict(Tuple)}.
 */
public class MaxEntChunker extends MaxEntClassifier {
    private static final Logger LOG = LoggerFactory.getLogger(MaxEntChunker.class);

    /**
     * Delimiter handed to {@code TrainingDataUtils.readSeqFile} by {@link #train(String)} and
     * {@link #validate(String)}. It is deliberately mutable: {@link #main(String[])} switches it
     * to a single space before reading its corpus files.
     */
    public static String TRAIN_FILE_DELIMITER = "\t";

    /** Locations used by {@link #main(String[])} when no command-line override is supplied. */
    private static final String DEFAULT_MODEL_PATH = "/Users/mguan/Desktop/chunker.maxent.model";
    private static final String DEFAULT_TRAIN_FILE = "/Users/mguan/workspace/nlp-service_training-data/corpora/CoNLL_Shared_Task/CoNLL_2000_Chunking/train.txt";
    private static final String DEFAULT_TEST_FILE = "/Users/mguan/workspace/nlp-service_training-data/corpora/CoNLL_Shared_Task/CoNLL_2000_Chunking/test.txt";

    /**
     * Trains the classifier from a sequence file.
     *
     * @param trainFilePath path of the training file, split on {@link #TRAIN_FILE_DELIMITER}
     * @throws FileNotFoundException if the file cannot be opened
     */
    public void train(String trainFilePath) throws FileNotFoundException {
        List<SequenceTuple> trainingData = readSequences(trainFilePath);
        LOG.info("Loaded Training data.");
        LOG.info("Generating feats");
        List<Tuple> trainingTuples = trainingData.parallelStream()
                .map(ChunkerFeatureExtractor::extractFeat)
                .flatMap(Collection::stream)
                .collect(Collectors.toList());
        LOG.info("Extracted Feats.");
        super.train(trainingTuples);
    }

    /**
     * Labels every token of a sentence.
     *
     * @param words the tokens of the sentence
     * @param pos   the part-of-speech tag of each token; must supply at least one tag per word
     * @return a sequence with one entry per word, each carrying its feature names, an all-ones
     *         feature vector and the predicted label
     * @throws IllegalArgumentException if {@code pos} has fewer elements than {@code words}
     * @throws IllegalStateException    if the classifier returns no candidate label for a token
     */
    public SequenceTuple predict(String[] words, String[] pos) {
        // Fail up front with a descriptive message instead of an
        // ArrayIndexOutOfBoundsException half-way through building the sequence.
        if (pos.length < words.length) {
            throw new IllegalArgumentException("Expected a POS tag for every word but got "
                    + words.length + " words and " + pos.length + " POS tags");
        }

        SequenceTuple st = new SequenceTuple();
        st.entries = new ArrayList<>();
        for (int i = 0; i < words.length; ++i) {
            LabeledVector v = new LabeledVector(new String[]{words[i], pos[i]});
            st.entries.add(new Tuple((IVector)v));
        }

        for (int i = 0; i < st.entries.size(); ++i) {
            final int tokenIndex = i;
            Tuple t = st.entries.get(i);
            String[] currentFeats = ChunkerFeatureExtractor.extractFeatSingle(i, words, pos).stream().toArray(String[]::new);
            // Replace the placeholder {word, pos} names with the extracted features; every
            // extracted feature is given the value 1.0.
            ((LabeledVector)t.vector).featsName = currentFeats;
            t.vector.setVector(IntStream.range(0, currentFeats.length).mapToDouble(x -> 1.0).toArray());
            t.label = super.predict(t).entrySet().stream()
                    .max(Map.Entry.comparingByValue())
                    .map(Map.Entry::getKey)
                    .orElseThrow(() -> new IllegalStateException(
                            "Classifier returned no labels for token " + tokenIndex));
        }
        return st;
    }

    /**
     * Runs the chunker over a labeled file and prints the error count and accuracy to standard
     * output. Each sequence containing at least one wrong label is printed once via
     * {@code CRFChunker.printSequenceTuple}.
     *
     * @param testFile path of the labeled test file, split on {@link #TRAIN_FILE_DELIMITER}
     * @throws FileNotFoundException if the file cannot be opened
     */
    public void validate(String testFile) throws FileNotFoundException {
        List<SequenceTuple> testData = readSequences(testFile);
        int errCount = 0;
        int total = 0;
        for (SequenceTuple expected : testData) {
            total += expected.entries.size();
            List<String> expectedTags = new ArrayList<>(expected.getLabel());
            String[] words = expected.entries.stream().map(x -> ((LabeledVector)x.vector).featsName[0]).toArray(String[]::new);
            String[] pos = expected.entries.stream().map(x -> ((LabeledVector)x.vector).featsName[1]).toArray(String[]::new);
            SequenceTuple predicted = this.predict(words, pos);
            boolean isThisSeqPrinted = false;
            for (int i = 0; i < expectedTags.size(); ++i) {
                if (expectedTags.get(i).equals(predicted.entries.get(i).label)) {
                    continue;
                }
                if (!isThisSeqPrinted) {
                    CRFChunker.printSequenceTuple(predicted, expectedTags);
                    System.out.println("");
                    isThisSeqPrinted = true;
                }
                ++errCount;
            }
        }
        System.out.println("Err/Total:\t" + errCount + "/" + total);
        if (total == 0) {
            // 0.0 / 0.0 would otherwise be reported as "NaN%".
            System.out.println("Accuracy:\tN/A (no test tokens)");
            return;
        }
        System.out.println("Accuracy:\t" + (1.0 - (double)errCount / (double)total) * 100.0 + "%");
    }

    /**
     * Trains a model, persists it, reloads it and validates it.
     *
     * @param args optional overrides, in order: training file, test file, model path; any that
     *             are missing fall back to the built-in default locations
     * @throws IOException if a file cannot be opened or the model stream fails
     */
    public static void main(String[] args) throws IOException {
        String trainFile = args.length > 0 ? args[0] : DEFAULT_TRAIN_FILE;
        String testFile = args.length > 1 ? args[1] : DEFAULT_TEST_FILE;
        String modelPath = args.length > 2 ? args[2] : DEFAULT_MODEL_PATH;

        MaxEntChunker chunker = new MaxEntChunker();
        TRAIN_FILE_DELIMITER = " ";
        Properties para = new Properties();
        chunker.setParameter(para);
        chunker.train(trainFile);
        chunker.persistModel(modelPath);
        try (FileInputStream modelStream = new FileInputStream(new File(modelPath))) {
            chunker.loadModel(modelStream);
        }
        chunker.validate(testFile);
    }

    /**
     * Reads a sequence file and always closes the stream opened for it.
     */
    private static List<SequenceTuple> readSequences(String path) throws FileNotFoundException {
        FileInputStream in = new FileInputStream(new File(path));
        try {
            // NOTE(review): the literal 2 is passed through unchanged; presumably it selects the
            // tag column of the file -- confirm against TrainingDataUtils.readSeqFile.
            return TrainingDataUtils.readSeqFile(in, TRAIN_FILE_DELIMITER, 2);
        } finally {
            try {
                in.close();
            } catch (IOException e) {
                // The data has already been read (or reading already failed); a close failure
                // must not widen the callers' declared exceptions or mask the original error.
                LOG.warn("Failed to close {}", path, e);
            }
        }
    }
}

