/*
 * Decompiled with CFR 0.152.
 */
package org.maochen.nlp.sentencetypeclassifier;

import com.google.common.collect.Sets;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.maochen.nlp.datastructure.DTree;
import org.maochen.nlp.sentencetypeclassifier.FeatureExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Training-side feature extractor.
 *
 * <p>Given a set of training entries it (1) parses the sentence part of every entry and
 * accumulates word and dependency bigram/trigram counts into the maps inherited from
 * {@link FeatureExtractor}, (2) serializes those four maps under {@code filepathPrefix}, and
 * (3) computes one feature vector per entry on a thread pool and writes the vectors, one per
 * line, to {@code <filepathPrefix>/featureVector.txt}.
 */
public class TrainingFeatureExtractor
extends FeatureExtractor {
    private static final Logger LOG = LoggerFactory.getLogger(TrainingFeatureExtractor.class);

    /** Parsed trees keyed by the sentence part of a training entry (text before the delimiter). */
    private final Map<String, DTree> depTreeCache = new HashMap<String, DTree>();

    public TrainingFeatureExtractor(String filepathPrefix, String delimiter) {
        super(filepathPrefix, delimiter);
    }

    /**
     * Extracts features for {@code trainingData} and writes them to
     * {@code <filepathPrefix>/featureVector.txt}. Does nothing when that file already exists.
     *
     * <p>Features are computed before the file is opened, so a failure during extraction does
     * not leave behind an empty file that would make later calls skip extraction. I/O failures
     * while writing are logged and not rethrown.
     *
     * @param trainingData training entries; the text before the delimiter is the sentence,
     *                     with tokens joined by {@code _}
     */
    public void extractFeature(Set<String> trainingData) {
        File featureVectorFile = new File(this.filepathPrefix + "/featureVector.txt");
        if (featureVectorFile.exists()) {
            return;
        }
        Set<String> featureVector = this.getFeats(trainingData);
        // FileWriter creates the file itself; try-with-resources closes it on every path.
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(featureVectorFile.getAbsoluteFile()))) {
            for (String s : featureVector) {
                bw.write(s);
                bw.newLine();
            }
        }
        catch (IOException e) {
            LOG.error("Failed to write feature vectors to " + featureVectorFile.getAbsolutePath(), e);
        }
    }

    /**
     * Increments the count of the n-gram formed by joining {@code tokens} with {@code _}.
     *
     * @param ngramMap n-gram to occurrence-count map that is updated in place
     * @param tokens   the tokens making up the n-gram, in order
     */
    private void addToMap(Map<String, Integer> ngramMap, String ... tokens) {
        ngramMap.merge(String.join("_", tokens), 1, Integer::sum);
    }

    /**
     * Counts every adjacent pair of {@code tokens} into {@code biGramMap} and every adjacent
     * triple into {@code triGramMap}.
     */
    private void addNGrams(String[] tokens, Map<String, Integer> biGramMap, Map<String, Integer> triGramMap) {
        for (int i = 0; i + 1 < tokens.length; ++i) {
            this.addToMap(biGramMap, tokens[i], tokens[i + 1]);
            if (i + 2 < tokens.length) {
                this.addToMap(triGramMap, tokens[i], tokens[i + 1], tokens[i + 2]);
            }
        }
    }

    /** Adds the bigrams and trigrams of the tree's {@code _}-separated dependency string to the dependency maps. */
    private void generateDEPNGram(DTree tree) {
        String depString = super.getDEPString(tree);
        this.addNGrams(depString.split("_"), this.biGramDepMap, this.triGramDepMap);
    }

    /**
     * Adds the bigrams and trigrams of the lower-cased sentence, wrapped in
     * {@code <sentence>} / {@code </sentence>} boundary tokens, to the word maps.
     *
     * @param sentence sentence whose tokens are separated by {@code _}
     */
    private void generateWordNGram(String sentence) {
        String strWithStartEndTag = "<sentence>_" + sentence.toLowerCase() + "_</sentence>";
        this.addNGrams(strWithStartEndTag.split("_"), this.biGramWordMap, this.triGramWordMap);
    }

    /**
     * Builds the n-gram model from all entries, persists it, then computes the feature vector
     * of every entry in parallel.
     *
     * <p>A task that fails is logged and its vector is omitted from the result. If the calling
     * thread is interrupted while waiting, waiting continues for the remaining tasks and the
     * interrupt flag is restored before returning.
     *
     * @param trainEntries training entries ({@code sentence<delimiter>...})
     * @return the distinct feature vectors that were produced
     */
    private Set<String> getFeats(Set<String> trainEntries) {
        LOG.info("Extracting Features ...");
        LOG.info("Generating NGram Model ...");
        for (String entry : trainEntries) {
            String sentence = entry.split(this.delimiter)[0];
            // Sentence tokens are joined by '_'; the parser is given the space-separated form.
            DTree tree = this.parser.parse(sentence.replaceAll("_", " "));
            this.depTreeCache.put(sentence, tree);
            this.generateDEPNGram(tree);
            this.generateWordNGram(sentence);
        }
        this.persistNGram();
        LOG.info("NGram Model completed ...");

        // Worker threads add to this set concurrently.
        Set<String> vectorSet = ConcurrentHashMap.newKeySet();
        String delimiter = this.delimiter;
        // The pool lives only for this extraction so its threads do not outlive the work.
        ExecutorService executorService = Executors.newFixedThreadPool(
                Runtime.getRuntime().availableProcessors(),
                new ThreadFactoryBuilder().setNameFormat("MaxEnt-FeatureExtractor-%d").build());
        boolean interrupted = false;
        try {
            List<Future<String>> futureList = new ArrayList<Future<String>>(trainEntries.size());
            for (String entry : trainEntries) {
                Callable<String> entryCallable = () -> {
                    String featVector = this.getFeats(entry, this.depTreeCache.get(entry.split(delimiter)[0]));
                    vectorSet.add(featVector);
                    return featVector;
                };
                futureList.add(executorService.submit(entryCallable));
            }
            for (Future<String> future : futureList) {
                try {
                    future.get();
                }
                catch (InterruptedException e) {
                    // Remember the interrupt, keep draining the remaining futures.
                    interrupted = true;
                    LOG.warn("Interrupted while waiting for a feature extraction task.", e);
                }
                catch (ExecutionException e) {
                    LOG.error("Feature extraction task failed.", e.getCause());
                }
            }
        }
        finally {
            executorService.shutdown();
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
        LOG.info("Extracting features completed.");
        return vectorSet;
    }

    /**
     * Writes {@code dataMap} to {@code filepath} using Java object serialization. I/O failures
     * are logged and not rethrown.
     */
    private void serialize(String filepath, Map<String, Integer> dataMap) {
        try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(filepath))) {
            oos.writeObject(dataMap);
        }
        catch (IOException e) {
            LOG.error("Failed to serialize n-gram map to " + filepath, e);
        }
    }

    /** Serializes the four n-gram count maps to their files under {@code filepathPrefix}. */
    private void persistNGram() {
        String biGramWordFile = this.filepathPrefix + "/bigram_word";
        String triGramWordFile = this.filepathPrefix + "/trigram_word";
        String biGramDepFile = this.filepathPrefix + "/bigram_dep";
        String triGramDepFile = this.filepathPrefix + "/trigram_dep";
        this.serialize(biGramWordFile, this.biGramWordMap);
        this.serialize(triGramWordFile, this.triGramWordMap);
        this.serialize(biGramDepFile, this.biGramDepMap);
        this.serialize(triGramDepFile, this.triGramDepMap);
    }
}

