/*
 * Decompiled with CFR 0.152.
 */
package edu.nyu.jet.ne;

import edu.nyu.jet.lex.Tokenizer;
import edu.nyu.jet.ne.Dictionary;
import edu.nyu.jet.ne.DictionaryTagger;
import edu.nyu.jet.ne.NamedEntityUtil;
import edu.nyu.jet.tipster.Document;
import edu.nyu.jet.util.Cdb;
import edu.nyu.jet.util.CdbBuilder;
import edu.nyu.jet.util.DoubleArrayTrie;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
/**
 * A {@link Dictionary} whose entries are located with a {@link DoubleArrayTrie}
 * and whose associated values are read from a {@link Cdb} file.
 *
 * <p>Usage is two-phase: {@link #lookupStart(String[])} records the token
 * sequence to search, then {@link #lookup(int)} is called for individual token
 * positions. The state of the current search is kept in instance fields, so a
 * later {@code lookupStart} call overwrites the state of an earlier one.
 *
 * <p>The static helpers {@link #prepare(String, String, String, String)} and
 * {@link #testDict(String, String, String)} build the two backing files from a
 * text dictionary and exercise them on a piece of text.
 */
public class TrieDictionary
extends Dictionary {
    /** Character encoding used for the keys and values exchanged with the cdb file. */
    private static final String CDB_ENCODING = "UTF-8";
    /** Separator between the two fields of a text dictionary line: a slash surrounded by whitespace. */
    private static final Pattern ENTRY_DELIMITER = Pattern.compile("\\s+/\\s+");

    private final DoubleArrayTrie trie = new DoubleArrayTrie();
    private final Cdb cdb;
    /** Tokens of the current search, each followed by a single space. */
    private StringBuilder text;
    /** {@code indexes[i]} is the offset in {@link #text} at which token {@code i} starts. */
    private int[] indexes;

    /**
     * Creates a dictionary from its two backing files.
     *
     * @param trieFile file loaded into the trie
     * @param cdbFile  file opened as the constant database
     * @throws IOException if either file cannot be read
     */
    public TrieDictionary(File trieFile, File cdbFile) throws IOException {
        this.trie.load(trieFile);
        this.cdb = new Cdb(cdbFile);
    }

    /**
     * Convenience constructor taking file names instead of {@link File} objects.
     *
     * @param trieFilename name of the file loaded into the trie
     * @param cdbFilename  name of the file opened as the constant database
     * @throws IOException if either file cannot be read
     */
    public TrieDictionary(String trieFilename, String cdbFilename) throws IOException {
        this(new File(trieFilename), new File(cdbFilename));
    }

    /**
     * Starts a new search over {@code tokens}.
     *
     * <p>The tokens are concatenated, each followed by one space, and the start
     * offset of every token in that string is remembered so that
     * {@link #lookup(int)} can translate between token positions and character
     * offsets.
     *
     * @param tokens the token sequence that subsequent {@code lookup} calls refer to
     */
    @Override
    public void lookupStart(String[] tokens) {
        this.text = new StringBuilder();
        this.indexes = new int[tokens.length];
        for (int i = 0; i < tokens.length; ++i) {
            this.indexes[i] = this.text.length();
            this.text.append(tokens[i]);
            this.text.append(' ');
        }
    }

    /**
     * Looks for a dictionary entry beginning at token {@code pos} of the
     * sequence given to the last {@link #lookupStart(String[])} call.
     *
     * @param pos index of the token at which the match must start
     * @return an entry holding the number of tokens whose start offset lies
     *         inside the matched span and the set of values found in the cdb
     *         for the matched text, or {@code null} when the trie reports no match
     */
    @Override
    public Dictionary.Entry lookup(int pos) {
        int start = this.indexes[pos];
        DoubleArrayTrie.Result result = this.trie.getLongestCommonPrefix(this.text, start);
        if (result == null) {
            return null;
        }
        int matched = result.getLength();
        try {
            // The last character of the matched span is left out of the cdb key;
            // presumably it is the separator space appended by lookupStart.
            // NOTE(review): buildCdb stores keys exactly as loadDict produced them,
            // i.e. including the trailing space - confirm that both sides agree.
            byte[] key = this.text.subSequence(start, start + matched - 1).toString().getBytes(CDB_ENCODING);
            HashSet<String> values = new HashSet<String>();
            this.cdb.findstart();
            byte[] value;
            while ((value = this.cdb.findNext(key)) != null) {
                values.add(new String(value, CDB_ENCODING));
            }
            // Count the tokens that begin inside the matched character span.
            int length = 0;
            while (pos + length < this.indexes.length && this.indexes[pos + length] - start < matched) {
                ++length;
            }
            return new Dictionary.Entry(length, values);
        }
        catch (UnsupportedEncodingException ex) {
            throw new RuntimeException(ex);
        }
    }

    /**
     * Reads {@code sample.txt} as ISO-8859-1 and runs {@link #testDict} on it
     * with the files {@code data/wsj.ned.da} and {@code data/wsj.ned.cdb}.
     *
     * @param args ignored
     * @throws IOException if any of the files cannot be read
     */
    public static void main(String[] args) throws IOException {
        String text = TrieDictionary.readText("sample.txt", "ISO-8859-1");
        TrieDictionary.testDict(text, "data/wsj.ned.da", "data/wsj.ned.cdb");
    }

    /**
     * Opens {@code filename} for buffered reading in the given encoding.
     * The underlying stream is closed again if the encoding is not supported,
     * so a failed call does not leak a file handle.
     */
    private static BufferedReader openReader(String filename, String encoding) throws IOException {
        FileInputStream stream = new FileInputStream(filename);
        try {
            return new BufferedReader(new InputStreamReader((InputStream)stream, encoding));
        }
        catch (UnsupportedEncodingException ex) {
            stream.close();
            throw ex;
        }
    }

    /**
     * Loads a text dictionary in which every line has the form
     * {@code <text> / <tag>} (a slash surrounded by whitespace separates the
     * two fields; only the first such separator on a line is used).
     *
     * <p>In the text field, {@code \/} is replaced by {@code /}, runs of
     * whitespace are collapsed to one space, and a single trailing space is
     * appended. Tags of lines with the same text are collected in one sorted set.
     * Lines containing only whitespace are skipped.
     *
     * @param filename file to read
     * @param encoding character encoding of the file
     * @return map from normalized text (with trailing space) to its tags
     * @throws IOException if the file cannot be read, or if a non-blank line
     *                     does not contain the separator
     */
    private static SortedMap<String, Set<String>> loadDict(String filename, String encoding) throws IOException {
        TreeMap<String, Set<String>> dict = new TreeMap<String, Set<String>>();
        BufferedReader in = TrieDictionary.openReader(filename, encoding);
        try {
            String line;
            int lineNumber = 0;
            while ((line = in.readLine()) != null) {
                ++lineNumber;
                if (line.trim().length() == 0) {
                    continue;
                }
                String[] tmp = ENTRY_DELIMITER.split(line, 2);
                if (tmp.length < 2) {
                    throw new IOException("Malformed dictionary entry at " + filename + ":" + lineNumber
                            + " (expected \"<text> / <tag>\"): " + line);
                }
                String word = tmp[0].replaceAll("\\\\/", "/").replaceAll("\\s+", " ").trim() + " ";
                String ne = tmp[1].trim();
                Set<String> value = dict.get(word);
                if (value == null) {
                    value = new TreeSet<String>();
                    dict.put(word, value);
                }
                value.add(ne);
            }
        }
        finally {
            in.close();
        }
        return dict;
    }

    /**
     * Builds a trie from the keys of {@code dict}, passed in the map's sorted order.
     *
     * @param dict dictionary whose keys become the trie keys
     * @return the newly built trie
     */
    private static DoubleArrayTrie buildTrie(SortedMap<String, Set<String>> dict) {
        char[][] keys = new char[dict.size()][];
        int i = 0;
        for (String key : dict.keySet()) {
            keys[i] = key.toCharArray();
            ++i;
        }
        DoubleArrayTrie trie = new DoubleArrayTrie();
        trie.build(keys, null);
        return trie;
    }

    /**
     * Writes one cdb record per (key, value) pair of {@code dict}, both encoded
     * as UTF-8. The builder is given {@code dbFilename + ".tmp"} as its second
     * file name.
     *
     * <p>NOTE(review): keys are written exactly as they appear in {@code dict};
     * the maps produced by {@code loadDict} have keys ending in a space, whereas
     * {@code lookup} queries with the last matched character removed. Confirm
     * that the two agree.
     *
     * @param dict       entries to store
     * @param dbFilename name of the cdb file to create
     * @throws IOException if the file cannot be written
     */
    private static void buildCdb(SortedMap<String, Set<String>> dict, String dbFilename) throws IOException {
        CdbBuilder builder = new CdbBuilder(dbFilename, dbFilename + ".tmp");
        for (Map.Entry<String, Set<String>> entry : dict.entrySet()) {
            byte[] key = entry.getKey().getBytes(CDB_ENCODING);
            for (String ne : entry.getValue()) {
                byte[] value = ne.getBytes(CDB_ENCODING);
                builder.add(key, value);
            }
        }
        builder.finish();
    }

    /**
     * Reads a whole text file into a string. Every line read is followed by
     * {@code "\n"} in the result, whatever line terminator the file used.
     *
     * @param filename file to read
     * @param encoding character encoding of the file
     * @return the file content with normalized line terminators
     * @throws IOException if the file cannot be read
     */
    public static String readText(String filename, String encoding) throws IOException {
        BufferedReader in = TrieDictionary.openReader(filename, encoding);
        try {
            StringBuilder builder = new StringBuilder();
            String line;
            while ((line = in.readLine()) != null) {
                builder.append(line);
                builder.append("\n");
            }
            return builder.toString();
        }
        finally {
            // Close on every path so a read failure does not leak the file handle.
            in.close();
        }
    }

    /**
     * Converts a text dictionary into the trie file and cdb file expected by
     * the constructors of this class.
     *
     * @param dictFilename text dictionary to read
     * @param encoding     character encoding of the text dictionary
     * @param trieFilename name of the trie file to write
     * @param cdbFilename  name of the cdb file to write
     * @throws IOException if the dictionary cannot be read or an output file cannot be written
     */
    public static void prepare(String dictFilename, String encoding, String trieFilename, String cdbFilename) throws IOException {
        SortedMap<String, Set<String>> dict = TrieDictionary.loadDict(dictFilename, encoding);
        DoubleArrayTrie trie = TrieDictionary.buildTrie(dict);
        trie.save(trieFilename);
        TrieDictionary.buildCdb(dict, cdbFilename);
    }

    /**
     * Tokenizes {@code text}, annotates it with a {@link DictionaryTagger}
     * using a dictionary loaded from the given files, packs the named entities
     * and prints the document's SGML output for {@code ENAMEX} to standard output.
     *
     * @param text         text to annotate
     * @param trieFilename name of the trie file
     * @param cdbFilename  name of the cdb file
     * @throws IOException if a dictionary file cannot be read
     */
    public static void testDict(String text, String trieFilename, String cdbFilename) throws IOException {
        Document doc = new Document(text);
        Tokenizer.tokenize(doc, doc.fullSpan());
        TrieDictionary dict = new TrieDictionary(trieFilename, cdbFilename);
        DictionaryTagger tagger = new DictionaryTagger();
        tagger.setDictionary(dict);
        tagger.annotate(doc);
        NamedEntityUtil.packNamedEntity(doc, null);
        System.out.println(doc.writeSGML("ENAMEX"));
    }
}

