/*
 * Decompiled with CFR 0.152.
 */
package edu.nyu.jet.ne;

import edu.nyu.jet.lex.EnglishLex;
import edu.nyu.jet.lex.Tokenizer;
import edu.nyu.jet.lisp.FeatureSet;
import edu.nyu.jet.ne.AlphaFeature;
import edu.nyu.jet.ne.Dictionary;
import edu.nyu.jet.ne.DocumentToSentenceIterator;
import edu.nyu.jet.ne.Evaluator;
import edu.nyu.jet.ne.FirstWordFeature;
import edu.nyu.jet.ne.LexiconCategoryFeature;
import edu.nyu.jet.ne.NamedEntityInDictionaryFeature;
import edu.nyu.jet.ne.NonAlphaFeature;
import edu.nyu.jet.ne.NumericalFeatures;
import edu.nyu.jet.ne.PatternFeature;
import edu.nyu.jet.ne.RegexpMatchFeature;
import edu.nyu.jet.ne.SentenceToTokenSequencePipe;
import edu.nyu.jet.ne.SummarizedPatternFeature;
import edu.nyu.jet.ne.TokenLowerText;
import edu.nyu.jet.ne.TrieDictionary;
import edu.nyu.jet.tipster.Annotation;
import edu.nyu.jet.tipster.Document;
import edu.nyu.jet.tipster.DocumentCollection;
import edu.nyu.jet.tipster.ExternalDocument;
import edu.nyu.jet.tipster.Span;
import edu.nyu.jet.util.IOUtils;
import edu.nyu.jet.zoner.SentenceSplitter;
import edu.nyu.jet.zoner.SpecialZoner;
import edu.umass.cs.mallet.base.fst.CRF3;
import edu.umass.cs.mallet.base.pipe.Pipe;
import edu.umass.cs.mallet.base.pipe.SerialPipes;
import edu.umass.cs.mallet.base.pipe.TokenSequence2FeatureVectorSequence;
import edu.umass.cs.mallet.base.pipe.tsf.TokenText;
import edu.umass.cs.mallet.base.types.Instance;
import edu.umass.cs.mallet.base.types.InstanceList;
import edu.umass.cs.mallet.base.types.Sequence;
import edu.umass.cs.mallet.base.types.TokenSequence;
import edu.umass.cs.mallet.base.util.PropertyList;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Vector;
import javax.xml.parsers.ParserConfigurationException;
import org.xml.sax.SAXException;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
public class CRFNameTagger {
    private static final String PUNCTUATIONS = "[,\\.;:?!()]";
    private static final String QUOTES = "[\"`']";
    private static final String OPEN_PAREN = "[\\[({]";
    private static final String CLOSE_PAREN = "[\\])}]";
    private CRF3 crf;
    private List<Pipe> features = new ArrayList<Pipe>();
    private PropertyList properties;

    public CRFNameTagger() {
        this.addFeatures();
    }

    public void train(Collection<Document> docs) {
        Pipe pipe = this.createPipe();
        CRF3 crf = new CRF3(pipe, null);
        InstanceList trainingData = new InstanceList(pipe);
        for (Document doc : docs) {
            DocumentToSentenceIterator iter = new DocumentToSentenceIterator(doc, "TEXT", trainingData.size() + 1);
            while (iter.hasNext()) {
                Instance carrier = iter.nextInstance();
                carrier.setPropertyList(this.properties);
                trainingData.add(carrier.getPipedCopy(pipe));
            }
        }
        crf.addStatesForLabelsConnectedAsIn(trainingData);
        crf.train(trainingData);
        this.crf = crf;
    }

    public void annotate(Document doc, Span span) {
        Pipe pipe = this.crf.getInputPipe();
        Instance carrier = new Instance(span, null, "sentence", doc);
        carrier.setPropertyList(this.properties);
        carrier = pipe.pipe(carrier);
        Sequence input = (Sequence)carrier.getData();
        Sequence labels = this.crf.viterbiPath(input).output();
        Vector<Annotation> tokens = doc.annotationsOfType("token", span);
        assert (tokens.size() == input.size());
        assert (tokens.size() == labels.size());
        int pos = 0;
        while (pos < tokens.size()) {
            String label = (String)labels.get(pos);
            if (!label.startsWith("B-")) {
                ++pos;
                continue;
            }
            int start = ((Annotation)tokens.get(pos)).start();
            String iLabel = "I-" + label.substring(2);
            ++pos;
            while (pos < tokens.size() && labels.get(pos).equals(iLabel)) {
                ++pos;
            }
            int end = ((Annotation)tokens.get(pos - 1)).end();
            FeatureSet attrs = new FeatureSet();
            attrs.put("TYPE", label.substring(2));
            doc.annotate("ENAMEX", new Span(start, end), attrs);
        }
    }

    public void setProperty(String key, Object value) {
        this.properties = PropertyList.add(key, value, this.properties);
    }

    protected Pipe createPipe() {
        Pipe[] pipes = new Pipe[]{new SentenceToTokenSequencePipe(), this.createFeaturePipe(), new TokenSequence2FeatureVectorSequence()};
        return new SerialPipes(pipes);
    }

    protected Pipe createFeaturePipe() {
        Pipe[] pipes = this.features.toArray(new Pipe[this.features.size()]);
        return new SerialPipes(pipes);
    }

    protected void addFeatures() {
        this.addFeature(new FirstWordFeature("FIRST_WORD"));
        this.addFeature(new NumericalFeatures("NUMERICAL"));
        this.addFeature(new RegexpMatchFeature("INITCAP", "\\p{Lu}.*"));
        this.addFeature(new RegexpMatchFeature("CAPITALIZED", "\\p{Lu}\\p{Ll}*"));
        this.addFeature(new RegexpMatchFeature("ALLCAPS", "\\p{Lu}+"));
        this.addFeature(new RegexpMatchFeature("ALLDIGITS", "[0-9]+"));
        this.addFeature(new RegexpMatchFeature("TWO_DIGITS", "[0-9]{2}"));
        this.addFeature(new RegexpMatchFeature("FOUR_DIGITS", "[0-9]{4}"));
        this.addFeature(new RegexpMatchFeature("MORETHANFOURDIGITS", "[0-9]{5,}"));
        this.addFeature(new RegexpMatchFeature("ROMAN_NUMBER", "[IXV]+"));
        this.addFeature(new RegexpMatchFeature("CAPITALANDDIGIT", "[A-Z0-9]+"));
        this.addFeature(new RegexpMatchFeature("YEAR_DECADE", "(?:[0-9]{2})?[0-9]{2}s"));
        this.addFeature(new RegexpMatchFeature("MIXEDCAPS", "\\p{Lu}\\p{Ll}+\\p{Lu}.*"));
        this.addFeature(new RegexpMatchFeature("MULTIDOT", "\\.\\.+"));
        this.addFeature(new RegexpMatchFeature("ENDSINDOT", "[^\\.].*\\."));
        this.addFeature(new RegexpMatchFeature("CONTAINSDASH", "\\w+-\\w*"));
        this.addFeature(new RegexpMatchFeature("ACRONYM", "\\p{Lu}[\\p{Lu}\\.]\\.[\\p{Lu}\\.]"));
        this.addFeature(new RegexpMatchFeature("CAP_OTHER_PERIOD", "[A-Z].+\\."));
        this.addFeature(new RegexpMatchFeature("CAP_PERIOD", "[A-Z]\\."));
        this.addFeature(new RegexpMatchFeature("SINGLECHAR", "."));
        this.addFeature(new RegexpMatchFeature("CAPLETTER", "[A-Z]"));
        this.addFeature(new RegexpMatchFeature("PUNCTUATION", PUNCTUATIONS));
        this.addFeature(new RegexpMatchFeature("QUOTE", QUOTES));
        this.addFeature(new AlphaFeature("a="));
        this.addFeature(new NonAlphaFeature("A="));
        this.addFeature(new PatternFeature("p="));
        this.addFeature(new SummarizedPatternFeature("P="));
        this.addFeature(new TokenText("W="));
        this.addFeature(new TokenLowerText("w="));
        this.addFeature(new RegexpMatchFeature("CURRENCY", "\\p{Sc}"));
        this.addFeature(new LexiconCategoryFeature("CAT="));
        this.addFeature(new RegexpMatchFeature("OPEN_PAREN", OPEN_PAREN));
        this.addFeature(new RegexpMatchFeature("CLOSE_PAREN", CLOSE_PAREN));
        this.addFeature(new NamedEntityInDictionaryFeature("NE="));
    }

    public void addFeature(Pipe feature) {
        this.features.add(feature);
    }

    public void writeModel(OutputStream out) throws IOException {
        ObjectOutputStream objOut = new ObjectOutputStream(out);
        objOut.writeObject(this.crf);
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    public void writeModel(File file) throws IOException {
        FileOutputStream out = new FileOutputStream(file);
        try {
            this.writeModel(out);
        }
        finally {
            IOUtils.closeQuietly(out);
        }
    }

    public void readModel(InputStream in) throws IOException, ClassNotFoundException {
        ObjectInputStream objIn = new ObjectInputStream(in);
        this.crf = (CRF3)objIn.readObject();
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    public void readModel(File file) throws IOException, ClassNotFoundException {
        FileInputStream in = new FileInputStream(file);
        try {
            this.readModel(in);
        }
        finally {
            IOUtils.closeQuietly(in);
        }
    }

    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            CRFNameTagger.usage();
        }
        File modelFile = new File(args[1]);
        File list = new File(args[2]);
        EnglishLex.readLexicon("data/Jet4.dict");
        if (args[0].equals("train")) {
            if (args.length != 3) {
                CRFNameTagger.usage();
            }
            CRFNameTagger.train(modelFile, list);
        } else if (args[0].equals("test")) {
            if (args.length != 4) {
                CRFNameTagger.usage();
            }
            File outDir = new File(args[3]);
            CRFNameTagger.test(modelFile, list, outDir);
        } else {
            CRFNameTagger.usage();
        }
    }

    private static void usage() {
        System.err.printf("usage: java %s train|test args", CRFNameTagger.class.getName());
        System.err.println();
        System.err.println();
        System.err.println("train parameters: ");
        System.err.println("    modelFilename targetDirectory");
        System.err.println();
        System.err.println("test parameters: ");
        System.err.println("    modelFilename targetDirectory outputDirectory");
        System.exit(1);
    }

    private static void train(File modelFile, File list) throws IOException, ParserConfigurationException, SAXException {
        Collection<Document> docs = CRFNameTagger.loadDocumentCollection(list);
        CRFNameTagger.prepareDocuments(docs);
        Dictionary dict = CRFNameTagger.loadDictionary();
        CRFNameTagger tagger = new CRFNameTagger();
        tagger.setProperty("dictionary", dict);
        tagger.train(docs);
        tagger.writeModel(modelFile);
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    public static void test(File modelFile, File list, File outDir) throws IOException, ParserConfigurationException, SAXException, ClassNotFoundException {
        Collection<Document> docs = CRFNameTagger.loadDocumentCollection(list);
        CRFNameTagger.prepareDocuments(docs);
        Dictionary dict = CRFNameTagger.loadDictionary();
        CRFNameTagger tagger = new CRFNameTagger();
        tagger.setProperty("dictionary", dict);
        tagger.readModel(modelFile);
        InstanceList testingData = new InstanceList(tagger.crf.getInputPipe());
        for (Document doc : docs) {
            Vector<Annotation> sentences = doc.annotationsOfType("sentence");
            for (Annotation sentence : sentences) {
                Instance carrier = new Instance(sentence.span(), null, "sentence", doc);
                carrier.setPropertyList(tagger.properties);
                carrier = carrier.getPipedCopy(tagger.crf.getInputPipe());
                testingData.add(carrier);
            }
        }
        PrintStream out = null;
        try {
            out = new PrintStream(new File(outDir, "tokens.txt"));
            for (int i = 0; i < testingData.size(); ++i) {
                Instance carrier = testingData.getInstance(i);
                TokenSequence tokens = (TokenSequence)carrier.getSource();
                Sequence expected = (Sequence)carrier.getTarget();
                Sequence input = (Sequence)carrier.getData();
                Sequence actual = tagger.crf.transduce(input);
                assert (tokens.size() == actual.size());
                assert (tokens.size() == expected.size());
                for (int j = 0; j < tokens.size(); ++j) {
                    out.printf("%-20s %15s %15s", tokens.getToken(j).getText(), expected.get(j), actual.get(j));
                    out.println();
                }
                out.println();
            }
        }
        catch (Throwable throwable) {
            IOUtils.closeQuietly(out);
            throw throwable;
        }
        IOUtils.closeQuietly(out);
        Evaluator evaluator = new Evaluator();
        for (Document gold : docs) {
            Document system = new Document(gold);
            system.removeAnnotationsOfType("ENAMEX");
            Vector<Annotation> sentences = system.annotationsOfType("sentence");
            for (Annotation sentence : sentences) {
                tagger.annotate(system, sentence.span());
            }
            evaluator.evaluate(system, gold);
        }
        System.out.printf("%-15s\t%10s\t%10s", "type", "precision", "recall");
        System.out.println();
        for (String type : evaluator.getTypes()) {
            double precision = evaluator.getPrecision(type);
            double recall = evaluator.getRecall(type);
            System.out.printf("%-15s\t%10.2f\t%10.2f", type, precision, recall);
            System.out.println();
        }
        System.out.printf("%-15s\t%10.2f\t%10.2f", "TOTAL", evaluator.getPrecision(), evaluator.getRecall());
        System.out.println();
        for (Document doc : docs) {
            doc.removeAnnotationsOfType("ENAMEX");
        }
        for (Document doc : docs) {
            Vector<Annotation> sentences = doc.annotationsOfType("sentence");
            for (Annotation sentence : sentences) {
                tagger.annotate(doc, sentence.span());
            }
            String id = CRFNameTagger.getId(doc);
            doc.removeAnnotationsOfType("token");
            doc.setSGMLwrapMargin(0);
            File docOutFile = new File(outDir, id + ".sgm");
            BufferedWriter docOut = null;
            try {
                docOut = new BufferedWriter(new FileWriter(docOutFile));
                docOut.append(doc.writeSGML(null));
            }
            catch (Exception ex) {
                try {
                    throw new RuntimeException(ex);
                }
                catch (Throwable throwable) {
                    IOUtils.closeQuietly(docOut);
                    throw throwable;
                }
            }
            IOUtils.closeQuietly(docOut);
        }
    }

    private static String getId(Document doc) {
        Vector<Annotation> docId = doc.annotationsOfType("DOCNO");
        if (docId != null && docId.size() != 0) {
            return doc.normalizedText((Annotation)docId.get(0));
        }
        docId = doc.annotationsOfType("DOCID");
        if (docId != null && docId.size() != 0) {
            return doc.normalizedText((Annotation)docId.get(0));
        }
        return null;
    }

    private static Collection<Document> loadDocumentCollection(File listFile) {
        DocumentCollection collection = new DocumentCollection(listFile.getPath());
        ArrayList<Document> docs = new ArrayList<Document>();
        if (!collection.open()) {
            return null;
        }
        for (int i = 0; i < collection.size(); ++i) {
            ExternalDocument doc = collection.get(i);
            doc.setAllTags(true);
            if (!doc.open()) {
                return null;
            }
            docs.add(doc);
        }
        return docs;
    }

    private static void prepareDocuments(Collection<Document> docs) {
        for (Document doc : docs) {
            SpecialZoner.findSpecialZones(doc);
            Vector<Annotation> textSegments = doc.annotationsOfType("TEXT");
            for (Annotation text : textSegments) {
                SentenceSplitter.split(doc, text.span());
            }
            Vector<Annotation> sentences = doc.annotationsOfType("sentence");
            for (Annotation sentence : sentences) {
                Tokenizer.tokenize(doc, sentence.span());
            }
            doc.removeAnnotationsOfType("textBreak");
            doc.removeAnnotationsOfType("dateline");
        }
    }

    private static Dictionary loadDictionary() throws IOException {
        TrieDictionary dict = new TrieDictionary("data/wsj.ned.da", "data/wsj.ned.cdb");
        return dict;
    }
}

