/*
 * Decompiled with CFR 0.152.
 */
package edu.nyu.jet.hmm;

import edu.nyu.jet.JetTest;
import edu.nyu.jet.aceJet.Ace;
import edu.nyu.jet.hmm.BigramHMMemitter;
import edu.nyu.jet.hmm.HMM;
import edu.nyu.jet.hmm.HMMannotator;
import edu.nyu.jet.hmm.HMMemitter;
import edu.nyu.jet.hmm.HMMstate;
import edu.nyu.jet.hmm.WordFeatureHMMemitter;
import edu.nyu.jet.lex.Tokenizer;
import edu.nyu.jet.lisp.FeatureSet;
import edu.nyu.jet.scorer.NEScorer;
import edu.nyu.jet.scorer.NameTagger;
import edu.nyu.jet.tipster.Annotation;
import edu.nyu.jet.tipster.AnnotationColor;
import edu.nyu.jet.tipster.Document;
import edu.nyu.jet.tipster.DocumentCollection;
import edu.nyu.jet.tipster.ExternalDocument;
import edu.nyu.jet.tipster.Span;
import edu.nyu.jet.zoner.SentenceSplitter;
import edu.nyu.jet.zoner.SpecialZoner;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Vector;

/**
 * A {@link NameTagger} that assigns name annotations using an {@link HMM}
 * driven through an {@link HMMannotator}.
 *
 * <p>The HMM topology is generated from a tag table (see
 * {@link #buildNameHMM(String)}), trained from annotated documents (the
 * {@code train} overloads), and can be written to / restored from a model
 * file ({@link #store(String)} / {@link #load(String)}).
 *
 * <p>Several methods write the static flag {@code Ace.monocase} and the static
 * field {@link #tagsToScore}; instances therefore share that state.
 */
public class HMMNameTagger
implements NameTagger {
    /** The underlying HMM; left {@code null} if the constructor rejected its argument. */
    public HMM nameHMM;
    /** Annotator wrapping {@link #nameHMM}; left {@code null} if the constructor rejected its argument. */
    public HMMannotator annotator;
    /** Not assigned anywhere in this class (the tag table itself is held by {@link #annotator}). */
    String[][] tagTable;
    /** Column 3 of every tag-table row; one family of HMM states is built per entry. */
    String[] NEtypeTable;
    /** Set to {"ENAMEX", "VALUE", "TIMEX"} whenever a tag table is read. */
    String[] tagsToRead;
    /** Same array as {@link #NEtypeTable}; handed to the HMM via {@code setTagsToCache}. */
    String[] tagsToCache;
    /** Annotation types passed to {@link NEScorer}; (re)assigned whenever any instance reads a tag table. */
    static String[] tagsToScore;
    /** Emitter class used for the HMM and for every state created by {@link #buildNameHMM(String)}. */
    Class emitterClass;
    /** Root of the hard-coded, developer-machine corpus paths used by the test drivers. */
    static final String home = "C:/Documents and Settings/Ralph Grishman/My Documents/";
    static final String ACEdir = home + "ACE 05/V4/";
    /** Selects {@link BigramHMMemitter} over {@link WordFeatureHMMemitter}; set by {@link #main(String[])}. */
    static boolean useAceBigrams = false;

    /**
     * Creates a tagger whose HMM uses the given emitter class.
     *
     * <p>If {@code emitterClass} is not a subtype of {@link HMMemitter}, a message
     * is printed and the constructor returns early, leaving {@link #nameHMM},
     * {@link #annotator} and {@link #emitterClass} {@code null}; such an instance
     * fails with a {@code NullPointerException} on first use.
     *
     * @param emitterClass class of the emitters to be used by the HMM states
     */
    public HMMNameTagger(Class emitterClass) {
        if (!HMMemitter.class.isAssignableFrom(emitterClass)) {
            System.out.println("HMMNameTagger constructor invoked with invalid class " + emitterClass);
            return;
        }
        this.emitterClass = emitterClass;
        this.nameHMM = new HMM(emitterClass);
        this.annotator = new HMMannotator(this.nameHMM);
        this.annotator.setBItag(false);
        this.annotator.setAnnotateEachToken(false);
    }

    /**
     * Reads the tag table from the named file.
     *
     * <p>An {@link IOException} is reported with {@code printStackTrace} and
     * otherwise ignored, in which case the tables of this tagger are left
     * unchanged. The reader is always closed.
     *
     * @param tagFileName path of the tag-table file
     */
    private void readTagTable(String tagFileName) {
        try {
            BufferedReader in = new BufferedReader(new FileReader(tagFileName));
            try {
                this.readTagTable(in);
            }
            finally {
                // Previously the reader was never closed, leaking the file handle.
                in.close();
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Lets the annotator read the tag table from {@code in}, then derives this
     * tagger's tables from it: {@link #NEtypeTable} / {@link #tagsToCache} take
     * column 3 of every row, and {@link #tagsToScore} / {@link #tagsToRead} are
     * reset to fixed values. The reader is not closed here.
     *
     * @param in reader positioned at the start of the tag table
     */
    private void readTagTable(BufferedReader in) {
        this.annotator.readTagTable(in);
        String[][] rows = this.annotator.getTagTable();
        ArrayList<String> hmmTagList = new ArrayList<String>();
        for (int i = 0; i < rows.length; ++i) {
            hmmTagList.add(rows[i][3]);
        }
        this.NEtypeTable = hmmTagList.toArray(new String[0]);
        this.tagsToCache = this.NEtypeTable;
        this.nameHMM.setTagsToCache(this.tagsToCache);
        tagsToScore = new String[]{"ENAMEX"};
        this.tagsToRead = new String[]{"ENAMEX", "VALUE", "TIMEX"};
    }

    /** Delegates writing of the tag table to the annotator. */
    private void writeTagTable(BufferedWriter bw) {
        this.annotator.writeTagTable(bw);
    }

    /**
     * Reads the tag table from {@code tagFileName} and builds the HMM state
     * graph for it, then resolves state names and resets the HMM for training.
     *
     * <p>States created: {@code start}, {@code other}, {@code end}, and for each
     * entry T of {@link #NEtypeTable} the six states {@code pre-T}, {@code i-T},
     * {@code b-T}, {@code m-T}, {@code e-T} and {@code post-T}. The {@code pre-},
     * {@code post-} and {@code other} states carry the tag "other"; the
     * {@code i-/b-/m-/e-} states carry the tag T.
     *
     * <p>If the tag file cannot be read, the failure has already been swallowed
     * by the reader helper; with no previously loaded table this method then
     * fails with a {@code NullPointerException}.
     *
     * @param tagFileName path of the tag-table file
     */
    public void buildNameHMM(String tagFileName) {
        this.readTagTable(tagFileName);
        final String[] types = this.NEtypeTable;

        // start: may go anywhere except into the middle/end/post states of a name.
        HMMstate startState = new HMMstate("start", "", this.emitterClass);
        this.nameHMM.addState(startState);
        startState.addArc("other");
        startState.addArc("end");
        for (int j = 0; j < types.length; ++j) {
            startState.addArc("pre-" + types[j]);
            startState.addArc("i-" + types[j]);
            startState.addArc("b-" + types[j]);
        }

        // other: loops on itself, ends, or moves to the pre-state of any type.
        HMMstate otherState = new HMMstate("other", "other", this.emitterClass);
        this.nameHMM.addState(otherState);
        otherState.addArc("other");
        otherState.addArc("end");
        for (int j = 0; j < types.length; ++j) {
            otherState.addArc("pre-" + types[j]);
        }

        HMMstate endState = new HMMstate("end", "", this.emitterClass);
        this.nameHMM.addState(endState);

        for (int i = 0; i < types.length; ++i) {
            String NEtype = types[i];

            // pre-T -> i-T | b-T
            HMMstate preState = new HMMstate("pre-" + NEtype, "other", this.emitterClass);
            this.nameHMM.addState(preState);
            preState.addArc("i-" + NEtype);
            preState.addArc("b-" + NEtype);

            HMMstate iState = new HMMstate("i-" + NEtype, NEtype, this.emitterClass);
            this.nameHMM.addState(iState);

            // b-T -> m-T | e-T
            HMMstate bState = new HMMstate("b-" + NEtype, NEtype, this.emitterClass);
            this.nameHMM.addState(bState);
            bState.addArc("m-" + NEtype);
            bState.addArc("e-" + NEtype);

            // m-T -> m-T | e-T
            HMMstate mState = new HMMstate("m-" + NEtype, NEtype, this.emitterClass);
            this.nameHMM.addState(mState);
            mState.addArc("m-" + NEtype);
            mState.addArc("e-" + NEtype);

            HMMstate eState = new HMMstate("e-" + NEtype, NEtype, this.emitterClass);
            this.nameHMM.addState(eState);

            HMMstate postState = new HMMstate("post-" + NEtype, "other", this.emitterClass);
            this.nameHMM.addState(postState);

            // i-T, e-T and post-T may all move to the pre-state of any type;
            // i-T and e-T may additionally jump directly into a name of a
            // *different* type (index comparison, hence the i == j skip).
            for (int j = 0; j < types.length; ++j) {
                iState.addArc("pre-" + types[j]);
                eState.addArc("pre-" + types[j]);
                postState.addArc("pre-" + types[j]);
                if (i == j) continue;
                iState.addArc("i-" + types[j]);
                iState.addArc("b-" + types[j]);
                eState.addArc("i-" + types[j]);
                eState.addArc("b-" + types[j]);
            }
            iState.addArc("post-" + NEtype);
            iState.addArc("end");
            eState.addArc("post-" + NEtype);
            eState.addArc("end");
            postState.addArc("other");
            postState.addArc("end");
        }
        this.nameHMM.resolveNames();
        this.nameHMM.resetForTraining();
    }

    /**
     * Trains on every document of a {@link DocumentCollection} and then
     * recomputes the HMM probabilities.
     *
     * @param trainingCollection path of the collection file
     * @throws IOException if a document cannot be processed
     */
    public void train(String trainingCollection) throws IOException {
        DocumentCollection trainCol = new DocumentCollection(trainingCollection);
        trainCol.open();
        for (int i = 0; i < trainCol.size(); ++i) {
            ExternalDocument doc = trainCol.get(i);
            System.out.println("\nTraining from document " + (i + 1) + ": " + doc.fileName());
            this.train(doc);
        }
        this.nameHMM.computeProbabilities();
    }

    /**
     * Trains on each file named (one per line) in {@code fileList}, resolving
     * the names by plain concatenation with {@code directory}, and then
     * recomputes the HMM probabilities. The file-list reader is always closed.
     *
     * @param directory prefix prepended verbatim to each file name (callers
     *                  must supply the trailing separator)
     * @param fileList  path of a text file listing one document per line
     * @throws IOException if the list or a document cannot be read
     */
    public void train(String directory, String fileList) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(fileList));
        try {
            int docCount = 0;
            String currentDoc;
            while ((currentDoc = reader.readLine()) != null) {
                System.out.println("\nTraining from document " + ++docCount + ": " + currentDoc);
                String neFileName = directory + currentDoc;
                ExternalDocument doc = new ExternalDocument("sgml", neFileName);
                this.train(doc);
            }
        }
        finally {
            // Previously the reader was never closed, even on a mid-loop failure.
            reader.close();
        }
        this.nameHMM.computeProbabilities();
    }

    /**
     * Trains the HMM on one document: opens it, splits each TEXT segment into
     * sentences, tokenizes every sentence and passes it to the annotator's
     * {@code trainOnSpan}. Probabilities are not recomputed here.
     *
     * <p>Documents without a TEXT annotation are reported and skipped; documents
     * that yield no sentence annotations are skipped silently. In both of those
     * cases the document's annotations are not cleared (as in the original).
     *
     * @param doc the document to train on
     * @throws IOException declared for callers; the visible calls do not show
     *                     which of them raises it
     */
    public void train(ExternalDocument doc) throws IOException {
        doc.setAllTags(true);
        doc.open();
        doc.stretchAll();
        doc.annotateWithTag("TEXT");
        SpecialZoner.findSpecialZones(doc);
        this.nameHMM.newDocument();
        Vector<Annotation> textSegments = doc.annotationsOfType("TEXT");
        if (textSegments == null) {
            System.out.println("No <TEXT> in " + doc.fileName() + ", skipped.");
            return;
        }
        for (Annotation segment : textSegments) {
            Span textSpan = segment.span();
            // Document-level case flag, set before sentence splitting.
            Ace.monocase = Ace.allLowerCase(doc);
            System.out.println(">>> Monocase is " + Ace.monocase);
            SentenceSplitter.split(doc, textSpan);
        }
        Vector<Annotation> sentences = doc.annotationsOfType("sentence");
        if (sentences == null) {
            return;
        }
        for (Annotation sentence : sentences) {
            Span sentenceSpan = sentence.span();
            // Sentence-level case flag, recomputed before each tokenization.
            Ace.monocase = Ace.allLowerCase(doc, sentenceSpan) || Ace.titleCase(doc, sentenceSpan);
            Tokenizer.tokenize(doc, sentenceSpan);
            this.annotator.trainOnSpan(doc, sentenceSpan);
        }
        doc.clearAnnotations();
    }

    /**
     * Writes the tag table, a line containing {@code endtags}, and then the HMM
     * to {@code fileName} using {@code JetTest.encoding}.
     *
     * <p>The writer and the underlying file stream are always closed here, which
     * also flushes any buffered output. Whether {@code HMM.store} closes the
     * writer itself is not visible from this file; a second close is harmless.
     *
     * @param fileName path of the model file to create
     * @throws IOException if the file cannot be opened or written
     */
    public void store(String fileName) throws IOException {
        FileOutputStream fileOut = new FileOutputStream(fileName);
        try {
            BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fileOut, JetTest.encoding));
            try {
                this.writeTagTable(bw);
                bw.write("endtags");
                bw.newLine();
                this.nameHMM.store(new PrintWriter(bw));
            }
            finally {
                // Flushes buffered output; a no-op if the writer is already closed.
                bw.close();
            }
        }
        finally {
            // Covers the case where constructing the writer itself failed.
            fileOut.close();
        }
    }

    /**
     * Restores a model written by {@link #store(String)}: reads the tag table
     * and then the HMM from {@code fileName} using {@code JetTest.encoding}.
     * The input stream is always closed.
     *
     * @param fileName path of the model file
     * @throws IOException if the file cannot be opened or read
     */
    public void load(String fileName) throws IOException {
        FileInputStream fileIn = new FileInputStream(fileName);
        try {
            BufferedReader in = new BufferedReader(new InputStreamReader(fileIn, JetTest.encoding));
            this.readTagTable(in);
            this.nameHMM.load(in);
        }
        finally {
            // Previously the stream was never closed.
            fileIn.close();
        }
    }

    /**
     * Tags all names in a document: splits each TEXT segment into sentences,
     * tokenizes each sentence and tags it with {@link #tag(Document, Span)}.
     *
     * <p>Mirrors {@link #train(ExternalDocument)}: if the document has no TEXT
     * annotations, or sentence splitting yields no sentence annotations, the
     * method returns without tagging instead of failing with a
     * {@code NullPointerException}.
     *
     * @param doc the document to annotate
     */
    public void tagDocument(Document doc) {
        this.nameHMM.newDocument();
        doc.annotateWithTag("TEXT");
        SpecialZoner.findSpecialZones(doc);
        Vector<Annotation> textSegments = doc.annotationsOfType("TEXT");
        if (textSegments == null) {
            // annotationsOfType can return null (train() guards against it too).
            return;
        }
        for (Annotation segment : textSegments) {
            Span textSpan = segment.span();
            Ace.monocase = Ace.allLowerCase(doc);
            System.out.println(">>> Monocase is " + Ace.monocase);
            SentenceSplitter.split(doc, textSpan);
        }
        Vector<Annotation> sentences = doc.annotationsOfType("sentence");
        if (sentences == null) {
            return;
        }
        for (Annotation sentence : sentences) {
            Span sentenceSpan = sentence.span();
            Ace.monocase = Ace.allLowerCase(doc, sentenceSpan) || Ace.titleCase(doc, sentenceSpan);
            Tokenizer.tokenize(doc, sentenceSpan);
            this.tag(doc, sentenceSpan);
        }
    }

    /** Notifies the HMM that a new document is starting. */
    public void newDocument() {
        this.nameHMM.newDocument();
    }

    /**
     * Tags one span. Spans that start inside a POSTER or SPEAKER zone are
     * handled by {@link #tagPersonZone}; all others go to the HMM annotator.
     *
     * @param doc  document containing the span
     * @param span span to tag (callers in this class pass tokenized sentences)
     */
    public void tag(Document doc, Span span) {
        boolean personZone = HMMNameTagger.inZone(doc, span, "POSTER")
                || HMMNameTagger.inZone(doc, span, "SPEAKER");
        if (personZone) {
            HMMNameTagger.tagPersonZone(doc, span, this.annotator);
        } else {
            this.annotator.annotateSpan(doc, span);
        }
    }

    /**
     * Tests whether the first non-whitespace position of {@code span} lies
     * inside some annotation of type {@code zoneType}.
     *
     * <p>If the span is entirely whitespace, the position tested is the span's
     * end offset.
     *
     * @param doc      document holding the text and the zone annotations
     * @param span     span whose start is tested
     * @param zoneType annotation type of the zones to look for
     * @return {@code true} if a zone covers the position (start inclusive, end
     *         exclusive); {@code false} otherwise or if there are no such zones
     */
    public static boolean inZone(Document doc, Span span, String zoneType) {
        String text = doc.text();
        int end = span.end();
        int posn = span.start();
        while (posn < end && Character.isWhitespace(text.charAt(posn))) {
            ++posn;
        }
        Vector<Annotation> zones = doc.annotationsOfType(zoneType);
        if (zones == null) {
            return false;
        }
        for (Annotation zone : zones) {
            Span zoneSpan = zone.span();
            if (posn >= zoneSpan.start() && posn < zoneSpan.end()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tags a span lying in a person zone. Leading whitespace is skipped; the
     * text up to the first comma (or the whole remainder if there is no comma)
     * is annotated as {@code ENAMEX} with {@code TYPE=PERSON}, and the text from
     * the comma onward is tagged by {@code annotator}.
     *
     * <p>With no comma the PERSON annotation is added even if the remaining
     * span is empty; with a comma it is added only if at least one character
     * precedes the comma.
     *
     * @param doc       document to annotate
     * @param span      span within the zone
     * @param annotator annotator used for the text after the name
     */
    public static void tagPersonZone(Document doc, Span span, HMMannotator annotator) {
        String text = doc.text();
        int end = span.end();
        int start = span.start();
        while (start < end && Character.isWhitespace(text.charAt(start))) {
            ++start;
        }
        int comma = start;
        while (comma < end && text.charAt(comma) != ',') {
            ++comma;
        }
        if (comma >= end) {
            // No comma: the whole (trimmed) span is the name.
            doc.annotate("ENAMEX", new Span(start, end), new FeatureSet("TYPE", "PERSON"));
            return;
        }
        Span sName = new Span(start, comma);
        Span sRest = new Span(comma, end);
        if (comma > start) {
            doc.annotate("ENAMEX", sName, new FeatureSet("TYPE", "PERSON"));
        }
        if (end > comma) {
            annotator.annotateSpan(doc, sRest);
        }
    }

    /**
     * Command-line trainer.
     *
     * <p>Arguments: {@code state-file model-file uni/bigram directory1 filelist1
     * [directory2 filelist2] ...}. The third argument selects the bigram emitter
     * only when it equals {@code "bigram"}. A trailing {@code "/"} is appended to
     * each directory when missing. The trained model is written to
     * {@code model-file}.
     *
     * @param args see above; the process exits with status 1 on a bad count
     * @throws IOException if training data or the model file cannot be processed
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 5 || args.length % 2 == 0) {
            System.err.println("HMMNameTagger requires 3 + 2n arguments for n training corpora:");
            System.err.println("  state-file model-file uni/bigram directory1 filelist1 [directory2 filelist2] ...");
            System.exit(1);
        }
        new AnnotationColor(ACEdir);
        String stateFile = args[0];
        String modelFile = args[1];
        useAceBigrams = args[2].equals("bigram");
        HMMNameTagger nt = newAceTagger();
        nt.buildNameHMM(stateFile);
        for (int iarg = 3; iarg < args.length; iarg += 2) {
            String directory = args[iarg];
            if (!directory.endsWith("/")) {
                directory = directory + "/";
            }
            String fileList = args[iarg + 1];
            nt.train(directory, fileList);
        }
        nt.store(modelFile);
    }

    /** Creates a tagger with the emitter class selected by {@link #useAceBigrams}. */
    private static HMMNameTagger newAceTagger() {
        Class emitter = useAceBigrams ? BigramHMMemitter.class : WordFeatureHMMemitter.class;
        return new HMMNameTagger(emitter);
    }

    /** Builds, trains and stores the ACE 04 model from hard-coded corpora, then scores it. */
    static void aceTrainTest() throws IOException {
        HMMNameTagger nt = newAceTagger();
        nt.buildNameHMM("acedata/ACEnameTags.txt");
        // NOTE: the original also declared "HMM/NE/ACE aug03 Collection.txt" but
        // never trained on it; that unused local has been dropped.
        nt.train(home + "HMM/NE/ACE BBN Collection.txt");
        nt.train(home + "HMM/NE/ACE training Collection.txt");
        nt.train(home + "ACE/training04 nwire 21andup ne.txt");
        nt.train(home + "ACE/training04 bnews 21andup ne.txt");
        if (useAceBigrams) {
            nt.store("acedata/ACEname04bigramHMM.txt");
        } else {
            nt.store("acedata/ACEname04HMM.txt");
        }
        HMMNameTagger.aceTest(nt);
    }

    /** Loads the stored ACE 04 model and scores it. */
    private static void aceLoadTest() throws IOException {
        HMMNameTagger nt = newAceTagger();
        if (useAceBigrams) {
            nt.load("acedata/ACEname04bigramHMM.txt");
        } else {
            nt.load("acedata/ACEname04HMM.txt");
        }
        HMMNameTagger.aceTest(nt);
    }

    /** Scores {@code nt} on the ACE 04 nwire test collection; clears {@code BigramHMMemitter.useBigrams} first. */
    private static void aceTest(HMMNameTagger nt) throws IOException {
        String testCollection = home + "ACE/training04 nwire 20 sgm.txt";
        String keyCollection = home + "ACE/training04 nwire 20 ne.txt";
        BigramHMMemitter.useBigrams = false;
        NEScorer.scoreCollection((NameTagger)nt, testCollection, keyCollection, tagsToScore);
    }

    /** Builds, trains and stores the ACE 05 model from hard-coded corpora, then scores it. */
    static void ace05TrainTest() throws IOException {
        HMMNameTagger nt = newAceTagger();
        nt.buildNameHMM("acedata/ACE05nameTags.txt");
        String tail05 = ACEdir + "NE/tailNE.txt";
        // The ACE 05 collection is trained on twice, exactly as in the original.
        // NOTE(review): presumably to give it double weight - confirm intent.
        nt.train(tail05);
        nt.train(tail05);
        nt.train(home + "ACE/training04 nwire 21andup ne.txt");
        nt.train(home + "ACE/training04 bnews 21andup ne.txt");
        if (useAceBigrams) {
            nt.store("acedata/ACEname05bigramHMM.txt");
        } else {
            nt.store("acedata/ACEname05HMM.txt");
        }
        HMMNameTagger.ace05Test(nt);
    }

    /** Loads the stored ACE 05 model and scores it. */
    private static void ace05LoadTest() throws IOException {
        HMMNameTagger nt = newAceTagger();
        if (useAceBigrams) {
            nt.load("acedata/ACEname05bigramHMM.txt");
        } else {
            nt.load("acedata/ACEname05HMM.txt");
        }
        HMMNameTagger.ace05Test(nt);
    }

    /** Scores {@code nt} on the ACE 05 "head" collection; clears {@code BigramHMMemitter.useBigrams} first. */
    private static void ace05Test(HMMNameTagger nt) throws IOException {
        String testCollection = ACEdir + "NE/headSgm.txt";
        String keyCollection = ACEdir + "NE/headNE.txt";
        BigramHMMemitter.useBigrams = false;
        NEScorer.scoreCollection((NameTagger)nt, testCollection, keyCollection, tagsToScore);
    }

    /** Builds, trains and stores the MUC model (word-feature emitter), then scores it. */
    private static void mucTrainTest() throws IOException {
        HMMNameTagger nt = new HMMNameTagger(WordFeatureHMMemitter.class);
        nt.buildNameHMM("data/MUCnameTags.txt");
        nt.train(home + "HMM/NE/NE train Collection.txt");
        nt.store("data/MUCnameHMM.txt");
        HMMNameTagger.mucTest(nt);
    }

    /** Builds the MUC state graph, loads the stored MUC model over it, and scores it. */
    private static void mucLoadTest() throws IOException {
        HMMNameTagger nt = new HMMNameTagger(WordFeatureHMMemitter.class);
        nt.buildNameHMM("data/MUCnameTags.txt");
        nt.load("data/MUCnameHMM.txt");
        HMMNameTagger.mucTest(nt);
    }

    /** Scores {@code nt} on the MUC test collection. */
    private static void mucTest(HMMNameTagger nt) throws IOException {
        String testCollection = home + "HMM/NE/NE test Collection.txt";
        String keyCollection = home + "HMM/NE/NE key Collection.txt";
        NEScorer.scoreCollection((NameTagger)nt, testCollection, keyCollection, tagsToScore);
    }

    /** Builds, trains and stores the "07" model from hard-coded corpora, then scores it on ACE 05. */
    static void galeTrainTest() throws IOException {
        HMMNameTagger nt = newAceTagger();
        nt.buildNameHMM("acedata/ACE05nameTags.txt");
        String tail05 = ACEdir + "NE/tailNE.txt";
        // NOTE: the original also declared "HMM/NE/ACE aug03 Collection.txt" and
        // "ACE 05/names/AFPfilelist.txt" but never trained on them; those unused
        // locals have been dropped.
        nt.train(home + "HMM/NE/ACE BBN Collection.txt");
        nt.train(home + "HMM/NE/ACE training Collection.txt");
        nt.train(home + "ACE/training04 nwire 21andup ne.txt");
        nt.train(home + "ACE/training04 bnews 21andup ne.txt");
        // Trained on twice, exactly as in the original.
        nt.train(tail05);
        nt.train(tail05);
        nt.train(home + "ACE 05/names/NYTfilelist.txt");
        nt.train(home + "Ace 07/ET/NE/filelist.sgm");
        if (useAceBigrams) {
            nt.store("acedata/AceNameBigram07HMM.txt");
        } else {
            nt.store("acedata/AceName07HMM.txt");
        }
        HMMNameTagger.ace05Test(nt);
    }

    /**
     * Loads a stored model and scores it on ACE 05.
     *
     * <p>NOTE(review): this loads the "06" model files, whereas
     * {@link #galeTrainTest()} stores "07" files - confirm which is intended.
     */
    static void galeLoadTest() throws IOException {
        HMMNameTagger nt = newAceTagger();
        if (useAceBigrams) {
            nt.load("acedata/ACEname06bigramHMM.txt");
        } else {
            nt.load("acedata/ACEname06HMM.txt");
        }
        HMMNameTagger.ace05Test(nt);
    }
}

