/*
 * Decompiled with CFR 0.152.
 */
package edu.nyu.jet.aceJet;

import edu.nyu.jet.JetTest;
import edu.nyu.jet.aceJet.Ace;
import edu.nyu.jet.aceJet.AceDocument;
import edu.nyu.jet.aceJet.AceEntity;
import edu.nyu.jet.aceJet.AceEntityMention;
import edu.nyu.jet.aceJet.AceEntityName;
import edu.nyu.jet.aceJet.AceTimex;
import edu.nyu.jet.aceJet.AceTimexMention;
import edu.nyu.jet.aceJet.Gazetteer;
import edu.nyu.jet.lisp.FeatureSet;
import edu.nyu.jet.refres.Resolve;
import edu.nyu.jet.tipster.Annotation;
import edu.nyu.jet.tipster.Document;
import edu.nyu.jet.tipster.ExternalDocument;
import edu.nyu.jet.tipster.Span;
import edu.nyu.jet.zoner.SentenceSplitter;
import edu.nyu.jet.zoner.SpecialZoner;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import java.util.TreeSet;
import java.util.Vector;

/**
 * Command-line utility that reads SGML source documents together with their
 * APF annotation files and writes the documents back out with the APF
 * information embedded as in-line annotations.
 *
 * <p>Arguments (see {@link #init(String[])}):
 * <pre>
 *   year apf-directory output-directory filelist apf-extension output-extension
 *        [gazetteer pre-dictionary] flag ...
 * </pre>
 * The gazetteer and pre-dictionary arguments are required only when
 * {@code year} is {@code "2004"}.  At least one flag must be given; the
 * recognised flags are {@code sentences timex mentions extents types names}.
 *
 * <p>All state is held in static fields, so the class is not safe for
 * concurrent use.
 */
public class APFtoXML {
    /** Flags accepted by {@link #setFlag(String)}. */
    private static final String[] VALID_FLAGS = {"sentences", "timex", "mentions", "extents", "types", "names"};

    /** Path of the file that lists the document names to process, one per line. */
    static String fileList;
    /** Directory holding the {@code .sgm} and APF files; always ends with "/" after {@link #init}. */
    static String ACEdir;
    /** Directory the annotated documents are written to; always ends with "/" after {@link #init}. */
    static String outputDir;
    /** Data year, one of "2002", "2003", "2004" or "2005". */
    static String year = "2005";
    /** File extension (without the dot) of the APF files. */
    static String apfExtension;
    /** File extension (without the dot) of the generated files. */
    static String outputExtension;
    /** The set of currently enabled output flags. */
    static Set<String> flags = new HashSet<String>();
    /** Number of documents handed to {@link #processFile(String)} so far. */
    static int docCount = 0;
    /** Gazetteer used for year 2004 to recognise nationalities and locations. */
    static Gazetteer gazetteer;
    /** Lower-cased head texts of PRE mentions that could not be classified (year 2004 only). */
    static TreeSet<String> unknownPre = new TreeSet<String>();
    /** Maps a word from the pre-dictionary file to its one-character class. */
    static HashMap<String, String> preDict = new HashMap<String, String>();

    /**
     * Entry point: parses the arguments, processes every document in the
     * file list and, for year 2004, prints the PRE head texts that could
     * not be classified.
     *
     * @param args command-line arguments, see the class comment
     * @throws IOException if the gazetteer or the file list cannot be read
     */
    public static void main(String[] args) throws IOException {
        APFtoXML.init(args);
        APFtoXML.processFileList(fileList);
        if (year.equals("2004")) {
            System.out.println("\nUnclassified items:  " + unknownPre.size());
            for (String word : unknownPre) {
                System.out.println(word);
            }
        }
    }

    /**
     * Initialises the static configuration from the command-line arguments.
     * Prints a usage message and terminates the JVM if the arguments are
     * missing, too few, or name an unsupported year.
     *
     * @param args command-line arguments, see the class comment
     * @throws IOException if loading the gazetteer fails
     */
    public static void init(String[] args) throws IOException {
        if (args.length == 0) {
            APFtoXML.argErr();
        }
        JetTest.encoding = "UTF-8";
        year = args[0];
        AceDocument.ace2004 = false;
        AceDocument.ace2005 = false;

        boolean is2004 = year.equals("2004");
        boolean is2005 = year.equals("2005");
        if (!is2004 && !is2005 && !year.equals("2002") && !year.equals("2003")) {
            System.err.println("Invalid year:  must be 2002-2005");
            System.exit(1);
        }

        // Year 2004 takes two extra positional arguments (gazetteer and
        // pre-dictionary).  The count is validated BEFORE those arguments are
        // read so that a short argument list produces the usage message
        // instead of an ArrayIndexOutOfBoundsException.  "<=" is deliberate:
        // at least one flag must follow the positional arguments.
        int requiredArgs = is2004 ? 8 : 6;
        if (args.length <= requiredArgs) {
            APFtoXML.argErr();
        }

        if (is2004) {
            String gazFile = args[6];
            String preDictFile = args[7];
            gazetteer = new Gazetteer();
            gazetteer.load(gazFile);
            APFtoXML.loadPreDict(preDictFile);
            AceDocument.ace2004 = true;
        } else if (is2005) {
            AceDocument.ace2004 = true;
            AceDocument.ace2005 = true;
        }

        ACEdir = withTrailingSlash(args[1]);
        outputDir = withTrailingSlash(args[2]);
        fileList = args[3];
        apfExtension = args[4];
        outputExtension = args[5];
        for (int i = requiredArgs; i < args.length; ++i) {
            APFtoXML.setFlag(args[i]);
        }
    }

    /** Returns {@code dir} with a "/" appended unless it already ends with one. */
    private static String withTrailingSlash(String dir) {
        return dir.endsWith("/") ? dir : dir + "/";
    }

    /**
     * Enables one output flag.  An unrecognised flag prints an error message
     * and terminates the JVM.
     *
     * @param flag one of {@code sentences timex mentions extents types names}
     */
    public static void setFlag(String flag) {
        for (String valid : VALID_FLAGS) {
            if (valid.equals(flag)) {
                flags.add(flag);
                return;
            }
        }
        System.err.println("APFtoXML:  invalid flag");
        System.err.println("possible flags:  sentences timex mentions extents types names");
        System.exit(1);
    }

    /** Disables all output flags. */
    public static void clearFlags() {
        flags.clear();
    }

    /** Prints the usage message to standard error and terminates the JVM with status 1. */
    private static void argErr() {
        System.err.println("APFtoXML arguments:");
        System.err.println("  year apf-directory  output-directory  filelist apf-extension output-extension [gazetteer pre-dictionary] flag ...");
        System.err.println("gazetteer and pre-dictionary needed for year = 2004");
        System.err.println("possible flags:  sentences timex mentions extents types names");
        System.exit(1);
    }

    /**
     * Loads the pre-dictionary into {@link #preDict}.  Each line is read as a
     * one-character class at index 0, one ignored character at index 1, and
     * the word from index 2 onward.  Lines shorter than two characters (for
     * example blank lines) are skipped.  An I/O failure is reported on
     * standard error and otherwise ignored.
     *
     * @param dictFile path of the dictionary file
     */
    private static void loadPreDict(String dictFile) {
        try {
            BufferedReader reader = new BufferedReader(new FileReader(dictFile));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.length() < 2) {
                        // Too short to hold a class and a separator; substring would throw.
                        continue;
                    }
                    String preType = line.substring(0, 1);
                    String word = line.substring(2);
                    preDict.put(word, preType);
                }
            }
            finally {
                reader.close();
            }
        }
        catch (IOException e) {
            System.err.print("Unable to load dictionary due to exception: ");
            System.err.println(e);
        }
    }

    /**
     * Processes every document named in the given list file.  Blank lines
     * are ignored; a failure in one document does not stop the others.
     *
     * @param fileList path of a file holding one document name per line
     * @throws IOException if the list file cannot be opened or read
     */
    private static void processFileList(String fileList) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(fileList));
        try {
            String currentDoc;
            while ((currentDoc = reader.readLine()) != null) {
                if (currentDoc.trim().length() == 0) {
                    continue;
                }
                APFtoXML.processFileAndCatchError(currentDoc);
            }
        }
        finally {
            reader.close();
        }
    }

    /**
     * Calls {@link #processFile(String)} and reports any exception on
     * standard error instead of propagating it.
     *
     * @param currentDoc document name, without directory or extension
     */
    public static void processFileAndCatchError(String currentDoc) {
        try {
            APFtoXML.processFile(currentDoc);
        }
        catch (Exception e) {
            System.err.println("Error: " + e.toString());
            e.printStackTrace();
        }
    }

    /**
     * Reads {@code ACEdir + docName + ".sgm"} and the matching APF file
     * ({@code ACEdir + docName + "." + apfExtension}), adds the annotations
     * selected by the flags, and saves the result in {@link #outputDir} as
     * {@code docName + "." + outputExtension}.
     *
     * @param docName document name, without directory or extension
     */
    public static void processFile(String docName) {
        System.out.println("\nProcessing document " + ++docCount + ": " + docName);
        String textFileName = ACEdir + docName + ".sgm";
        ExternalDocument doc = new ExternalDocument("sgml", textFileName);
        doc.setAllTags(true);
        if (year.equals("2003") || year.equals("2004")) {
            doc.setEmptyTags(new String[]{"TURN"});
        }
        doc.open();
        String APFfileName = ACEdir + docName + "." + apfExtension;
        AceDocument aceDoc = new AceDocument(textFileName, APFfileName);
        APFtoXML.addAnnotations(doc, aceDoc);
        doc.setSGMLwrapMargin(0);
        doc.saveAs(outputDir, docName + "." + outputExtension);
    }

    /**
     * Adds the annotations selected by the flags to {@code doc} and returns
     * the document rendered by {@code doc.writeSGML(null)}.
     *
     * @param doc    the document to annotate (modified in place)
     * @param aceDoc the APF information for {@code doc}
     * @return the annotated document as a string
     */
    public static String processDocument(Document doc, AceDocument aceDoc) {
        APFtoXML.addAnnotations(doc, aceDoc);
        return doc.writeSGML(null).toString();
    }

    /**
     * Adds sentence, timex, mention and name annotations to {@code doc},
     * each only if the corresponding flag is set.  For year 2004 the
     * gazetteer's monocase setting is first updated from the document.
     *
     * @param doc    the document to annotate (modified in place)
     * @param aceDoc the APF information for {@code doc}
     */
    public static void addAnnotations(Document doc, AceDocument aceDoc) {
        boolean monocase = Ace.allLowerCase(doc);
        if (year.equals("2004")) {
            gazetteer.setMonocase(monocase);
        }
        if (flags.contains("sentences")) {
            APFtoXML.addSentences(doc);
        }
        if (flags.contains("timex")) {
            APFtoXML.addTimexTags(doc, aceDoc);
        }
        if (flags.contains("mentions")) {
            APFtoXML.addMentionTags(doc, aceDoc);
        }
        if (flags.contains("names")) {
            APFtoXML.addENAMEXtags(doc, aceDoc);
        }
    }

    /**
     * Splits every {@code TEXT} annotation of the document into sentences,
     * gives each {@code sentence} annotation an {@code ID} feature of the
     * form {@code SENT-n} (n starting at 1), and removes the
     * {@code dateline} and {@code textBreak} annotations.  Prints a message
     * and returns if the document has no {@code TEXT} annotation.
     *
     * @param doc the document to annotate (modified in place)
     */
    static void addSentences(Document doc) {
        SpecialZoner.findSpecialZones(doc);
        Vector<Annotation> textSegments = doc.annotationsOfType("TEXT");
        if (textSegments == null) {
            System.out.println("No <TEXT> in document");
            return;
        }
        for (Annotation ann : textSegments) {
            Span textSpan = ann.span();
            Ace.monocase = Ace.allLowerCase(doc);
            SentenceSplitter.split(doc, textSpan);
        }
        Vector<Annotation> sentences = doc.annotationsOfType("sentence");
        if (sentences != null) {
            int sentNo = 0;
            for (Annotation sentence : sentences) {
                sentence.put("ID", "SENT-" + ++sentNo);
            }
        }
        doc.removeAnnotationsOfType("dateline");
        doc.removeAnnotationsOfType("textBreak");
        doc.shrink("sentence");
    }

    /**
     * Adds one {@code timex2} annotation per time expression, placed on the
     * extent of the expression's first mention.  The features {@code val},
     * {@code anchor_val}, {@code anchor_dir}, {@code set} and {@code mod}
     * are copied when they are non-null and non-empty.  Time expressions
     * without any mention are skipped.
     *
     * @param doc    the document to annotate (modified in place)
     * @param aceDoc the APF information for {@code doc}
     */
    static void addTimexTags(Document doc, AceDocument aceDoc) {
        ArrayList<AceTimex> timeExpressions = aceDoc.timeExpressions;
        for (AceTimex timex : timeExpressions) {
            if (timex.mentions.isEmpty()) {
                // Nothing to anchor the annotation to; get(0) would throw.
                continue;
            }
            AceTimexMention mention = (AceTimexMention)timex.mentions.get(0);
            FeatureSet features = new FeatureSet();
            putIfPresent(features, "val", timex.val);
            putIfPresent(features, "anchor_val", timex.anchorVal);
            putIfPresent(features, "anchor_dir", timex.anchorDir);
            putIfPresent(features, "set", timex.set);
            putIfPresent(features, "mod", timex.mod);
            doc.annotate("timex2", toJetSpan(mention.extent), features);
        }
    }

    /** Stores {@code value} under {@code name} unless it is null or equal to the empty string. */
    private static void putIfPresent(FeatureSet features, String name, Object value) {
        if (value != null && !value.equals("")) {
            features.put(name, value);
        }
    }

    /**
     * Returns a new span with the same start as {@code aceSpan} and an end
     * one position later.  (Presumably the APF end offset is inclusive while
     * the Jet end offset is exclusive -- inferred from the +1, not verified.)
     */
    private static Span toJetSpan(Span aceSpan) {
        return new Span(aceSpan.start(), aceSpan.end() + 1);
    }

    /**
     * Adds an {@code ENAMEX} annotation, with feature {@code TYPE} set to
     * the entity type, for every name of every entity.  For year 2004 the
     * heads of mentions of type {@code PRE} are tagged as well when the
     * gazetteer recognises the head as a nationality or location, or the
     * pre-dictionary assigns it class {@code "N"}.  PRE heads absent from
     * both are reported and recorded in {@link #unknownPre}.
     *
     * @param doc    the document to annotate (modified in place)
     * @param aceDoc the APF information for {@code doc}
     */
    static void addENAMEXtags(Document doc, AceDocument aceDoc) {
        ArrayList<AceEntity> entities = aceDoc.entities;
        for (AceEntity entity : entities) {
            ArrayList<AceEntityName> names = entity.names;
            for (AceEntityName name : names) {
                doc.annotate("ENAMEX", toJetSpan(name.extent), new FeatureSet("TYPE", entity.type));
            }
            if (!year.equals("2004")) {
                continue;
            }
            ArrayList<AceEntityMention> mentions = entity.mentions;
            for (AceEntityMention mention : mentions) {
                // Filter first: only PRE mentions need the lookups below.
                if (!mention.type.equals("PRE")) {
                    continue;
                }
                String htext = Resolve.normalizeName(mention.headText);
                String[] mentionName = Gazetteer.splitAtWS(htext);
                String key = htext.toLowerCase();
                String preClass = preDict.get(key);
                if (gazetteer.isNationality(mentionName) || gazetteer.isLocation(mentionName) || "N".equals(preClass)) {
                    doc.annotate("ENAMEX", toJetSpan(mention.head), new FeatureSet("TYPE", entity.type));
                } else if (preClass == null) {
                    // NOTE: the "{" ... ")" mismatch is kept as-is so the output stays unchanged.
                    System.out.println("Unclassified PRE: " + mention.text + " {" + mention.headText + ")");
                    unknownPre.add(key);
                }
            }
        }
    }

    /**
     * Adds a {@code mention} annotation on the head of every entity mention
     * whose head span has a non-negative start.  The {@code entity} feature
     * holds the index of the entity in {@code aceDoc.entities}.  With the
     * {@code types} flag, {@code type} (at most the first three characters
     * of the entity type) and, if non-null, {@code subtype} are added.  With
     * the {@code extents} flag, {@code extent} holds the XML-escaped mention
     * text with newlines replaced by spaces.
     *
     * @param doc    the document to annotate (modified in place)
     * @param aceDoc the APF information for {@code doc}
     */
    static void addMentionTags(Document doc, AceDocument aceDoc) {
        boolean withTypes = flags.contains("types");
        boolean withExtents = flags.contains("extents");
        ArrayList<AceEntity> entities = aceDoc.entities;
        for (int i = 0; i < entities.size(); ++i) {
            AceEntity entity = entities.get(i);
            ArrayList<AceEntityMention> mentions = entity.mentions;
            for (AceEntityMention mention : mentions) {
                Span aceSpan = mention.head;
                if (aceSpan.start() < 0) {
                    continue;
                }
                FeatureSet features = new FeatureSet("entity", Integer.valueOf(i));
                if (withTypes) {
                    String type = entity.type;
                    // Guard against types shorter than three characters.
                    features.put("type", type.length() > 3 ? type.substring(0, 3) : type);
                    if (entity.subtype != null) {
                        features.put("subtype", entity.subtype);
                    }
                }
                if (withExtents) {
                    String cleanExtent = mention.text.replace('\n', ' ');
                    features.put("extent", AceEntityMention.addXmlEscapes(cleanExtent));
                }
                doc.annotate("mention", toJetSpan(aceSpan), features);
            }
        }
    }
}

