/*
 * Decompiled with CFR 0.152.
 */
package org.cogroo.gc.cmdline.dictionary;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.SortedMap;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.TreeSet;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.BasicCmdLineTool;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.postag.Triple;
import opennlp.tools.util.featuregen.StringPattern;
import org.cogroo.entities.impl.MorphologicalTag;
import org.cogroo.formats.ad.ADFeaturizerSampleStream;
import org.cogroo.gc.cmdline.dictionary.POSDictionaryBuilderParams;
import org.cogroo.interpreters.FlorestaTagInterpreter;
import org.cogroo.interpreters.JspellTagInterpreter;
import org.cogroo.interpreters.TagInterpreter;
import org.cogroo.tools.featurizer.FeatureSample;

/**
 * Command line tool that converts a Jspell-style lexical dictionary into a
 * tab separated dictionary ({@code word<TAB>lemma<TAB>class[#features]}), as
 * described by {@link #getShortDescription()}.
 *
 * <p>The corpus is read first to collect the feature and POS tag strings that
 * occur in it; dictionary entries are then filtered against those sets.
 * Optionally, (token, tag) pairs seen in the corpus but missing from the
 * dictionary are added as well.
 */
public class TabSeparatedPOSDictionaryBuilderTool
extends BasicCmdLineTool {
    private static final char HT = '\t';
    private static final char NL = '\n';
    /** Encoding used to read the corpus file. */
    private static final String CORPUS_ENCODING = "ISO-8859-1";
    /** Encoding used to write the resulting dictionary. */
    private static final String OUTPUT_ENCODING = "UTF-8";
    /** Class tag whose tokens keep their original casing. */
    private static final String PROPER_NOUN = "prop";

    /** Returns the one-line description of this tool. */
    public String getShortDescription() {
        return "builds a new tab separated lexical dictionary to be used with FSA builder";
    }

    /** Returns the usage text generated from {@link Params}. */
    public String getHelp() {
        return this.getBasicHelp(Params.class);
    }

    /**
     * Runs the tool: collects known tags from the corpus, parses the input
     * dictionary, optionally adds corpus entries, and writes the result.
     *
     * @param args command line arguments, parsed into {@link Params}
     * @throws TerminateToolException if reading or writing fails, or any other
     *         exception is raised while building the dictionary
     */
    public void run(String[] args) {
        Params params = (Params)this.validateAndParseParams(args, Params.class);
        File dictInFile = params.getInputFile();
        File dictOutFile = params.getOutputFile();
        File corpusFile = params.getCorpus();
        Charset encoding = params.getEncoding();
        CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
        CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
        CmdLineUtil.checkInputFile("corpus input file", corpusFile);
        InputStreamReader in = null;
        OutputStreamWriter out = null;
        try {
            boolean includeFeatures = params.getIsIncludeFeatures();

            // First corpus pass: collect every feature and POS tag string in use.
            Set<String> knownFeats = new HashSet<String>();
            Set<String> knownPostags = new HashSet<String>();
            ADFeaturizerSampleStream sentenceStream = new ADFeaturizerSampleStream(new FileInputStream(corpusFile), CORPUS_ENCODING, params.getExpandME());
            try {
                FeatureSample sample = sentenceStream.read();
                while (sample != null) {
                    Collections.addAll(knownFeats, sample.getFeatures());
                    Collections.addAll(knownPostags, sample.getTags());
                    sample = sentenceStream.read();
                }
            }
            finally {
                // Close even when read() fails, so the corpus file handle is not leaked.
                sentenceStream.close();
            }

            in = new InputStreamReader(new FileInputStream(dictInFile), encoding);
            SortedMap<String, Set<Triple>> entries = new TreeMap<String, Set<Triple>>();
            TabSeparatedPOSDictionaryBuilderTool.parseOneEntryPerLine(in, entries, new JspellTagInterpreter(), new FlorestaTagInterpreter(), knownFeats, knownPostags, params.getAllowInvalidFeats(), includeFeatures);
            in.close();

            if (params.getIncludeFromCorpus().booleanValue()) {
                // Second corpus pass: add (token, tag) pairs the dictionary lacks.
                SortedMap<String, Set<String>> added = new TreeMap<String, Set<String>>();
                sentenceStream = new ADFeaturizerSampleStream(new FileInputStream(corpusFile), CORPUS_ENCODING, params.getExpandME());
                try {
                    FeatureSample sample = sentenceStream.read();
                    while (sample != null) {
                        String[] toks = sample.getSentence();
                        String[] lemmas = sample.getLemmas();
                        String[] tags = sample.getTags();
                        String[] feats = sample.getFeatures();
                        for (int i = 0; i < toks.length; ++i) {
                            // Everything but proper nouns is stored lower-cased.
                            String tok = PROPER_NOUN.equals(tags[i]) ? toks[i] : toks[i].toLowerCase();
                            if (!this.isValid(entries.get(tok), tok, tags[i], lemmas[i], feats[i], includeFeatures)) {
                                continue;
                            }
                            Triple t = TabSeparatedPOSDictionaryBuilderTool.asTriple(tags[i], lemmas[i], feats[i], includeFeatures);
                            TabSeparatedPOSDictionaryBuilderTool.put(tok, t, entries);
                            if (PROPER_NOUN.equals(t.getClazz())) {
                                // Proper nouns are added but not reported.
                                continue;
                            }
                            Set<String> addedForTok = added.get(tok);
                            if (addedForTok == null) {
                                addedForTok = new HashSet<String>();
                                added.put(tok, addedForTok);
                            }
                            addedForTok.add(t.toString());
                        }
                        sample = sentenceStream.read();
                    }
                }
                finally {
                    sentenceStream.close();
                }
                // Report what was taken from the corpus.
                for (String k : added.keySet()) {
                    for (String v : added.get(k)) {
                        System.out.println(k + " - " + v);
                    }
                }
            }

            out = new OutputStreamWriter(new FileOutputStream(dictOutFile), OUTPUT_ENCODING);
            for (String token : entries.keySet()) {
                for (Triple triple : entries.get(token)) {
                    out.append(TabSeparatedPOSDictionaryBuilderTool.toString(token, triple));
                }
            }
            // Explicit close on the success path so flush failures are reported.
            out.close();
        }
        catch (IOException e) {
            throw new TerminateToolException(-1, "IO error while reading training data or indexing data: " + e.getMessage(), e);
        }
        catch (Exception e) {
            throw new TerminateToolException(-1, "Exception: " + e.getMessage(), e);
        }
        finally {
            // Either stream may still be null if the failure happened early;
            // closing them independently keeps the original error visible.
            TabSeparatedPOSDictionaryBuilderTool.closeQuietly(in);
            TabSeparatedPOSDictionaryBuilderTool.closeQuietly(out);
        }
    }

    /** Closes {@code resource} if it is non-null, ignoring any failure (best-effort cleanup). */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        }
        catch (IOException ignored) {
            // Nothing useful can be done here; the primary error (if any) is already propagating.
        }
    }

    /** Adds {@code t} to the set of triples stored under {@code tok}, creating the set on first use. */
    private static void put(String tok, Triple t, SortedMap<String, Set<Triple>> entries) {
        Set<Triple> triples = entries.get(tok);
        if (triples == null) {
            triples = new HashSet<Triple>();
            entries.put(tok, triples);
        }
        triples.add(t);
    }

    /**
     * Decides whether a corpus token should be added to the dictionary.
     *
     * @param knownTriples triples already stored for {@code tok}; may be {@code null}
     * @param tok the (possibly lower-cased) token
     * @param clazz the class tag of the token
     * @param lemma the lemma of the token (currently not consulted)
     * @param feats the feature string of the token
     * @param includeFeatures whether features take part in the duplicate check
     * @return {@code false} if the token contains a digit, the tag starts with
     *         {@code "B-"} or {@code "I-"}, or an entry with the same class
     *         (and features, when included) already exists; {@code true} otherwise
     */
    private boolean isValid(Collection<Triple> knownTriples, String tok, String clazz, String lemma, String feats, boolean includeFeatures) {
        if (StringPattern.recognize(tok).containsDigit()) {
            return false;
        }
        if (clazz.startsWith("B-") || clazz.startsWith("I-")) {
            return false;
        }
        if (knownTriples != null) {
            // When features are excluded both sides use the literal "null" suffix,
            // so only the class is effectively compared.
            String candidate = clazz + "|" + (includeFeatures ? feats : null);
            for (Triple t : knownTriples) {
                String known = t.getClazz() + "|" + (includeFeatures ? t.getFeats() : null);
                if (candidate.equals(known)) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Parses a dictionary with one entry per line, where each line is
     * {@code word lemma>tag [lemma>tag ...]} separated by single spaces, and
     * stores the accepted analyses in {@code entries}.
     *
     * <p>Blank lines are skipped. Analyses whose tag cannot be parsed or
     * serialized are reported on {@code System.err} and skipped. The reader is
     * not closed by this method.
     *
     * @param in source of dictionary lines
     * @param entries map that receives the accepted triples, keyed by word
     * @param tago interpreter used to parse the tags found in the input
     * @param tagd interpreter used to serialize class and feature tags for output
     * @param knownFeats feature strings considered valid
     * @param knownPostags class tag strings considered valid
     * @param allowInvalidFeats whether to accept feature strings not in {@code knownFeats}
     * @param includeFeatures whether features are stored in the triples; when
     *        {@code false}, feature validation is disabled
     * @throws IOException if reading from {@code in} fails
     */
    public static void parseOneEntryPerLine(Reader in, SortedMap<String, Set<Triple>> entries, TagInterpreter tago, TagInterpreter tagd, Set<String> knownFeats, Set<String> knownPostags, boolean allowInvalidFeats, boolean includeFeatures) throws IOException {
        // Sorted copy so the summary printed at the end is ordered.
        Set<String> sortedKnownFeats = new TreeSet<String>(knownFeats);
        // Features that are not stored do not need to be validated.
        boolean acceptAnyFeats = allowInvalidFeats || !includeFeatures;
        BufferedReader lineReader = new BufferedReader(in);
        Set<String> unknownTags = new TreeSet<String>();
        String line;
        while ((line = lineReader.readLine()) != null) {
            StringTokenizer whiteSpaceTokenizer = new StringTokenizer(line, " ");
            if (!whiteSpaceTokenizer.hasMoreTokens()) {
                // Blank line: nextToken() would throw NoSuchElementException.
                continue;
            }
            String word = whiteSpaceTokenizer.nextToken();
            while (whiteSpaceTokenizer.hasMoreTokens()) {
                String data = whiteSpaceTokenizer.nextToken();
                String[] lemmaTag = data.split(">");
                if (lemmaTag.length != 2) {
                    System.err.println("** Invalid lemmatag. " + word + " -> " + data);
                    continue;
                }
                MorphologicalTag completeTag = tago.parseMorphologicalTag(lemmaTag[1]);
                if (completeTag == null || completeTag.getClazzE() == null) {
                    System.err.println("-- Missing class tag. " + word + " -> " + data);
                    continue;
                }
                MorphologicalTag classMT = new MorphologicalTag();
                classMT.setClazz(completeTag.getClazzE());
                String classString = tagd.serialize(classMT);
                if (classString == null) {
                    // Skip instead of dereferencing null below and aborting the whole run.
                    System.err.println("-- Could not serialize class tag. " + word + " -> " + data);
                    continue;
                }
                MorphologicalTag featsMT = completeTag.clone();
                featsMT.setClazz(null);
                String featsString = null;
                if (!featsMT.isEmpty()) {
                    featsString = tagd.serialize(featsMT);
                }
                if (featsString == null || featsString.length() == 0) {
                    featsString = "-";
                }
                boolean featsAccepted = acceptAnyFeats || sortedKnownFeats.contains(featsString);
                // Hyphenated verb forms are dropped.
                if (classString.startsWith("v-") && word.contains("-")) {
                    continue;
                }
                if ("pron".equals(classString)) {
                    if (!featsAccepted) {
                        continue;
                    }
                    // A generic pronoun is stored under both pronoun classes.
                    TabSeparatedPOSDictionaryBuilderTool.put(word, TabSeparatedPOSDictionaryBuilderTool.asTriple("pron-det", lemmaTag[0], featsString, includeFeatures), entries);
                    TabSeparatedPOSDictionaryBuilderTool.put(word, TabSeparatedPOSDictionaryBuilderTool.asTriple("pron-indp", lemmaTag[0], featsString, includeFeatures), entries);
                    continue;
                }
                if (knownPostags.contains(classString) && featsAccepted) {
                    TabSeparatedPOSDictionaryBuilderTool.put(word, TabSeparatedPOSDictionaryBuilderTool.asTriple(classString, lemmaTag[0], featsString, includeFeatures), entries);
                    continue;
                }
                if ("pnt".equals(classString) && knownPostags.contains(word)) {
                    // Punctuation whose surface form is itself a known tag.
                    TabSeparatedPOSDictionaryBuilderTool.put(word, TabSeparatedPOSDictionaryBuilderTool.asTriple(word, word, null, includeFeatures), entries);
                } else if (!classString.startsWith("v-")) {
                    System.err.println("unknown - " + word + " -> " + new Triple(classString, lemmaTag[0], classString + "_" + featsString));
                }
                unknownTags.add(classString + "_" + featsString);
            }
        }
        if (sortedKnownFeats.size() > 0) {
            System.err.print("Known tags:");
            for (String tag : sortedKnownFeats) {
                System.err.print(" " + tag);
            }
            System.err.println();
        }
        if (unknownTags.size() > 0) {
            System.err.print("Found unknown tags:");
            for (String tag : unknownTags) {
                System.err.print(" " + tag);
            }
            System.err.println();
        }
    }

    /** Builds a triple, dropping {@code feats} when features are not included. */
    private static Triple asTriple(String clazz, String lemma, String feats, boolean includeFeatures) {
        return new Triple(clazz, lemma, includeFeatures ? feats : null);
    }

    /**
     * Formats one output line: {@code word<TAB>lemma<TAB>class}, followed by
     * {@code #features} when the triple has a non-empty feature string, and a
     * trailing newline.
     */
    private static String toString(String word, Triple t) {
        StringBuilder sb = new StringBuilder();
        sb.append(word).append(HT).append(t.getLemma()).append(HT).append(t.getClazz());
        String feats = t.getFeats();
        if (feats != null && feats.length() > 0) {
            sb.append("#").append(feats);
        }
        sb.append(NL);
        return sb.toString();
    }

    /** Command line parameters of this tool, in addition to the shared dictionary builder parameters. */
    static interface Params
    extends POSDictionaryBuilderParams {
        /** Whether feature strings are stored in the dictionary entries. */
        // NOTE(review): valueName reads "includeFetures" (typo) in the usage text — confirm before changing.
        @ArgumentParser.ParameterDescription(valueName="includeFetures", description="include features")
        @ArgumentParser.OptionalParameter(defaultValue="false")
        public Boolean getIsIncludeFeatures();

        /** Whether entries found in the corpus but missing from the dictionary are added. */
        @ArgumentParser.ParameterDescription(valueName="includeFromCorpus", description="include from corpus")
        @ArgumentParser.OptionalParameter(defaultValue="false")
        public Boolean getIncludeFromCorpus();

        /** Flag passed through to the corpus sample stream. */
        // NOTE(review): description duplicates the one of includeFromCorpus — looks copy-pasted; confirm intended text.
        @ArgumentParser.ParameterDescription(valueName="expandME", description="include from corpus")
        @ArgumentParser.OptionalParameter(defaultValue="false")
        public Boolean getExpandME();
    }
}

