/*
 * Decompiled with CFR 0.152.
 */
package edu.nyu.jet.aceJet;

import edu.nyu.jet.aceJet.Ace;
import edu.nyu.jet.aceJet.Gazetteer;
import edu.nyu.jet.refres.Resolve;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.TreeMap;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * Stand-alone tool that tallies how the names of an entity relate to the
 * names that preceded them within the same entity.
 *
 * <p>{@link #main} reads a list of document names, and for each document
 * parses its APF (XML) annotation file and its source text. Every
 * {@code name} of every {@code entity} is then compared with the earlier names
 * of that entity (see {@link #analyzeNames}) and exactly one counter is
 * incremented per name that has at least one predecessor. {@link #report}
 * prints the counters.
 *
 * <p>All state is kept in static fields; the class makes no attempt at
 * thread-safety.
 */
public class APFNameAnalyzer {
    /** Character encoding used by {@link #readDocument} for the source text files. */
    static String encoding = "ISO-8859-1";
    /** Replaced by an empty map on every call to {@link #findNames}; not read within this class. */
    static HashMap startTag;
    /** Replaced by an empty set on every call to {@link #findNames}; not read within this class. */
    static HashSet endTag;
    /** XML parser shared by all documents; created in {@link #main}. */
    static DocumentBuilder builder;
    /** Directory holding the gazetteer, the file list and the documents. */
    static final String ACEdir = "C:/Documents and Settings/Ralph Grishman/My Documents/ACE/";
    /** File containing one document name (without extension) per line. */
    static final String fileList = ACEdir + "training nwire.txt";

    // One counter per relation recognised by analyzeNames.
    static int identityCount = 0;
    static int equalsIgnoreCaseCount = 0;
    static int lastNameCount = 0;
    static int lastTwoNameCount = 0;
    static int firstNameCount = 0;
    static int personSubseqCount = 0;
    static int acronymCount = 0;
    static int reverseAcronymCount = 0;
    static int abbreviationCount = 0;
    static int reverseAbbreviationCount = 0;
    static int capitalCount = 0;
    static int subseqCount = 0;
    /** Names that have a predecessor but matched none of the relations above. */
    static int leftovers = 0;

    /**
     * For each character position {@code i} of the source text: {@code i}
     * minus the number of characters up to and including {@code i} that lie
     * inside {@code <...>} tags. Filled by {@link #computeOffsets}.
     */
    static int[] ACEoffsetMap = null;
    /**
     * Inverse of {@link #ACEoffsetMap}: maps a tag-free offset (as used by
     * the APF {@code start}/{@code end} values) to a position in the source
     * text. Filled by {@link #computeOffsets}.
     */
    static int[] JEToffsetMap = null;
    /** Short entity-type code to long type name; not read within this class. */
    static HashMap standardType;
    /** When true, {@link #isAcronym} and {@link #isAbbreviation} print each match. */
    static boolean trace = false;

    static {
        // Populate through a typed local so the raw-typed field needs no unchecked calls.
        HashMap<String, String> types = new HashMap<String, String>();
        types.put("GSP", "GPE");
        types.put("PER", "PERSON");
        types.put("ORG", "ORGANIZATION");
        types.put("LOC", "LOCATION");
        types.put("FAC", "FACILITY");
        standardType = types;
    }

    /**
     * Loads the gazetteer, analyzes every document named in {@link #fileList}
     * and prints the resulting counts.
     *
     * @param args ignored
     * @throws Exception if the gazetteer, the file list or any document cannot
     *                   be read or parsed
     */
    public static void main(String[] args) throws Exception {
        Resolve.trace = false;
        Ace.gazetteer = new Gazetteer();
        Ace.gazetteer.load(ACEdir + "loc.dict");
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setValidating(false);
        builder = factory.newDocumentBuilder();

        // The APF file suffix depends only on the (constant) list name, so
        // decide it once instead of once per document.
        boolean newData = fileList.indexOf("03") > 0;
        String apfSuffix = newData ? ".apf.xml" : ".sgm.tmx.rdc.xml";

        BufferedReader reader = new BufferedReader(new FileReader(fileList));
        try {
            int docCount = 0;
            String currentDoc;
            while ((currentDoc = reader.readLine()) != null) {
                if (currentDoc.trim().length() == 0) {
                    // A blank line (e.g. at the end of the list) is not a document name.
                    continue;
                }
                System.out.println("\nProcessing document " + ++docCount + ": " + currentDoc);
                String textFileName = ACEdir + currentDoc + ".sgm";
                String APFfileName = ACEdir + currentDoc + apfSuffix;
                analyzeDocument(textFileName, APFfileName);
            }
        } finally {
            reader.close();
        }
        report();
    }

    /**
     * Parses one APF file, reads the matching source text, rebuilds the
     * offset maps for that text and analyzes all names in the document.
     *
     * @param textFileName path of the source text file
     * @param APFfileName  path of the APF (XML) annotation file
     * @throws SAXException if the APF file is not well-formed
     * @throws IOException  if either file cannot be read
     */
    private static void analyzeDocument(String textFileName, String APFfileName) throws SAXException, IOException {
        // parse(String) expects a URI; parse(File) is the overload intended
        // for plain file-system paths (these contain spaces).
        Document apfDoc = builder.parse(new File(APFfileName));
        StringBuffer fileText = readDocument(textFileName);
        computeOffsets(fileText);
        findNames(apfDoc, fileText);
    }

    /**
     * Walks every {@code entity} element of {@code apfDoc}, extracts the text
     * of each of its {@code name} elements from {@code fileText}, and feeds the
     * names to {@link #analyzeNames} in order of their start position.
     *
     * <p>Requires {@link #JEToffsetMap} to have been built for {@code fileText}
     * by {@link #computeOffsets}.
     *
     * @param apfDoc   parsed APF document
     * @param fileText source text the APF offsets refer to
     */
    static void findNames(Document apfDoc, StringBuffer fileText) {
        startTag = new HashMap();
        endTag = new HashSet();
        NodeList entities = apfDoc.getElementsByTagName("entity");
        for (int i = 0; i < entities.getLength(); ++i) {
            Element entity = (Element) entities.item(i);
            String type = getElementText(entity, "entity_type");
            ArrayList<String> priorNames = new ArrayList<String>();
            NodeList names = entity.getElementsByTagName("name");
            // Keyed by start position so names are visited in text order;
            // two names with the same start position collapse to the later one.
            TreeMap<Integer, String> nameStart = new TreeMap<Integer, String>();
            for (int j = 0; j < names.getLength(); ++j) {
                Element name = (Element) names.item(j);
                int start = Integer.parseInt(getElementText(name, "start"));
                int startJet = JEToffsetMap[start];
                int end = Integer.parseInt(getElementText(name, "end"));
                int endJet = JEToffsetMap[end];
                // The APF end offset is inclusive, substring's end is exclusive.
                String nameString = fileText.substring(startJet, endJet + 1);
                nameStart.put(Integer.valueOf(startJet), nameString);
            }
            for (String nameString : nameStart.values()) {
                analyzeNames(priorNames, nameString, type);
            }
        }
    }

    /**
     * Returns the value of the first child node of the first descendant of
     * {@code e} whose tag is {@code elementType}.
     *
     * @param e           element to search below
     * @param elementType tag name to look for
     * @return the node value of that descendant's first child
     * @throws IllegalArgumentException if there is no such descendant or it has
     *                                  no child node
     */
    private static String getElementText(Element e, String elementType) {
        NodeList matches = e.getElementsByTagName(elementType);
        Element first = (Element) matches.item(0);
        if (first == null || first.getFirstChild() == null) {
            throw new IllegalArgumentException(
                    "Element <" + e.getTagName() + "> has no <" + elementType + "> descendant with content");
        }
        return first.getFirstChild().getNodeValue();
    }

    /**
     * Reads a whole text file using {@link #encoding}. Every line is
     * terminated with a single {@code '\n'} in the result, whatever line
     * terminator the file used.
     *
     * @param fileName path of the file to read
     * @return the file content
     * @throws IOException if the file cannot be opened or read
     */
    static StringBuffer readDocument(String fileName) throws IOException {
        StringBuffer fileText = new StringBuffer();
        InputStream in = new FileInputStream(new File(fileName));
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding));
            String line;
            while ((line = reader.readLine()) != null) {
                fileText.append(line).append('\n');
            }
        } finally {
            // Closing the underlying stream releases the file handle even if
            // the reader could not be created or reading failed part-way.
            in.close();
        }
        return fileText;
    }

    /**
     * Rebuilds {@link #ACEoffsetMap} and {@link #JEToffsetMap} for
     * {@code fileText}. Characters from a {@code '<'} up to and including the
     * next {@code '>'} are treated as tag characters and do not advance the
     * tag-free offset.
     *
     * @param fileText source text of the current document
     */
    static void computeOffsets(StringBuffer fileText) {
        int length = fileText.length();
        ACEoffsetMap = new int[length];
        JEToffsetMap = new int[length];
        boolean inTag = false;
        int xmlCount = 0; // number of tag characters seen so far
        for (int i = 0; i < length; ++i) {
            char c = fileText.charAt(i);
            if (c == '<') {
                inTag = true;
            }
            // While inside a tag this slot is overwritten on every character;
            // the first character after the tag writes the final value.
            JEToffsetMap[i - xmlCount] = i;
            if (inTag) {
                ++xmlCount;
            }
            ACEoffsetMap[i] = i - xmlCount;
            if (c == '>') {
                inTag = false;
            }
        }
    }

    /**
     * Normalizes {@code currentName}, appends it to {@code priorNames} and, if
     * the list already held other names, increments exactly one counter
     * describing how the new name relates to an earlier one.
     *
     * <p>An exact match against any earlier name wins. Otherwise earlier names
     * are examined from most recent to oldest, and for each one the relations
     * are tried in a fixed order; the first that holds is counted. If nothing
     * matches, the name is printed and counted in {@link #leftovers}.
     *
     * @param priorNames  normalized names already seen for this entity, oldest
     *                    first; the normalized current name is appended
     * @param currentName the name as it appears in the text
     * @param type        entity type; {@code "PER"}/{@code "PERSON"},
     *                    {@code "ORG"}/{@code "ORGANIZATION"} and {@code "GPE"}
     *                    enable type-specific relations
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    static void analyzeNames(ArrayList priorNames, String currentName, String type) {
        String[] tokens = Gazetteer.splitAtWS(currentName);
        tokens = Resolve.normalizeGazName(tokens, false, false);
        currentName = Resolve.concat(tokens);
        priorNames.add(currentName);
        Object[] currentCountry = Ace.gazetteer.capitalToCountry(tokens);

        // The current name is now the last element; everything before it is "prior".
        int priorCount = priorNames.size() - 1;
        for (int i = 0; i < priorCount; ++i) {
            String priorName = (String) priorNames.get(i);
            if (currentName.equals(priorName)) {
                ++identityCount;
                return;
            }
        }

        // The type does not change inside the loop, so classify it once.
        boolean isPerson = "PER".equals(type) || "PERSON".equals(type);
        boolean isGpe = "GPE".equals(type);
        boolean isOrgOrGpe = "ORG".equals(type) || "ORGANIZATION".equals(type) || isGpe;

        for (int i = priorCount - 1; i >= 0; --i) {
            String priorName = (String) priorNames.get(i);
            String[] priorTokens = Gazetteer.splitAtWS(priorName);
            int last = priorTokens.length - 1; // -1 when the prior name has no tokens

            if (currentName.equalsIgnoreCase(priorName)) {
                ++equalsIgnoreCaseCount;
                return;
            }
            if (isPerson) {
                if (tokens.length == 1 && last >= 0 && currentName.equals(priorTokens[last])) {
                    ++lastNameCount;
                    return;
                }
                if (tokens.length == 2 && priorTokens.length > 2
                        && tokens[0].equalsIgnoreCase(priorTokens[last - 1])
                        && tokens[1].equals(priorTokens[last])) {
                    ++lastTwoNameCount;
                    return;
                }
                if (tokens.length == 1 && last >= 0 && currentName.equals(priorTokens[0])) {
                    ++firstNameCount;
                    return;
                }
                if (Resolve.matchFullName(tokens, "", priorTokens, "") >= 0) {
                    ++personSubseqCount;
                    return;
                }
            }
            if (isOrgOrGpe) {
                if (tokens.length == 1 && isAcronym(priorTokens, currentName)) {
                    ++acronymCount;
                    return;
                }
                if (priorTokens.length == 1 && isAcronym(tokens, priorName)) {
                    ++reverseAcronymCount;
                    return;
                }
                if (tokens.length == 1 && isAbbreviation(priorTokens, currentName)) {
                    ++abbreviationCount;
                    return;
                }
                // NOTE(review): unlike the forward case this calls
                // Resolve.isAbbreviation and tests its result against 0;
                // kept as found - confirm against Resolve that this is intended.
                if (priorTokens.length == 1 && Resolve.isAbbreviation(tokens, priorName) == 0) {
                    ++reverseAbbreviationCount;
                    return;
                }
            }
            if (isGpe && currentCountry != null && Resolve.equalArray(currentCountry, priorTokens)) {
                ++capitalCount;
                return;
            }
            if (!isPerson && Resolve.matchFullName(tokens, "", priorTokens, "") >= 0) {
                ++subseqCount;
                return;
            }
        }

        if (priorNames.size() > 1) {
            System.out.println(currentName + " is alias of " + priorNames.get(0));
            ++leftovers;
        }
    }

    /** Prints every counter to standard output. */
    static void report() {
        System.out.println("Coreference counts:");
        System.out.println("  identity:              " + identityCount);
        System.out.println("  identityIgnoringCase:  " + equalsIgnoreCaseCount);
        System.out.println("  last name:             " + lastNameCount);
        System.out.println("  last two names:        " + lastTwoNameCount);
        System.out.println("  first name:            " + firstNameCount);
        System.out.println("  other subseq (person): " + personSubseqCount);
        System.out.println("  acronym:               " + acronymCount);
        System.out.println("  name follows acronym:  " + reverseAcronymCount);
        System.out.println("  abbreviation:          " + abbreviationCount);
        System.out.println("  name follows abbrev.:  " + reverseAbbreviationCount);
        System.out.println("  capital of country     " + capitalCount);
        System.out.println("  subseq. (not person):  " + subseqCount);
        System.out.println("  other:                 " + leftovers);
    }

    /**
     * Tests whether the initial letters of the words of {@code name} spell
     * {@code acronym}. The words "the", "of", "for" and "and" (any case) and
     * empty tokens are ignored; initials are compared case-sensitively.
     *
     * <p>NOTE(review): the method does not require that every character of
     * {@code acronym} is consumed, so an acronym with extra trailing characters
     * is still accepted. Kept as found - confirm whether that is intended.
     *
     * @param name    tokens of the full name
     * @param acronym candidate acronym
     * @return {@code true} if both arguments have at least two elements /
     *         characters and every significant word's initial matches the next
     *         acronym character
     */
    public static boolean isAcronym(String[] name, String acronym) {
        if (name.length < 2 || acronym.length() < 2) {
            return false;
        }
        int iacr = 0;
        for (int i = 0; i < name.length; ++i) {
            if (isIgnoredWord(name[i])) {
                continue;
            }
            if (iacr < acronym.length() && name[i].charAt(0) == acronym.charAt(iacr)) {
                ++iacr;
                continue;
            }
            return false;
        }
        if (trace) {
            System.out.println("Refres: recognizing " + acronym + " as acronym of " + Resolve.concat(name));
        }
        return true;
    }

    /**
     * Tests whether {@code abbrev} is a dotted abbreviation (letter, period,
     * letter, period, ...) whose letters are the initials of the words of
     * {@code name}. The words "the", "of", "for" and "and" (any case) and empty
     * tokens are ignored; initials are compared case-sensitively.
     *
     * <p>NOTE(review): as with {@link #isAcronym}, trailing letter/period pairs
     * of {@code abbrev} that no word consumes are not rejected. Kept as found.
     *
     * @param name   tokens of the full name
     * @param abbrev candidate abbreviation
     * @return {@code true} if {@code name} has at least two tokens,
     *         {@code abbrev} has an even length of at least four, and every
     *         significant word's initial matches the next letter of
     *         {@code abbrev}, which must be followed by a period
     */
    public static boolean isAbbreviation(String[] name, String abbrev) {
        if (name.length < 2 || abbrev.length() < 4 || abbrev.length() % 2 == 1) {
            return false;
        }
        int iabr = 0;
        for (int i = 0; i < name.length; ++i) {
            if (isIgnoredWord(name[i])) {
                continue;
            }
            if (iabr < abbrev.length() - 1 && name[i].charAt(0) == abbrev.charAt(iabr) && abbrev.charAt(iabr + 1) == '.') {
                iabr += 2;
                continue;
            }
            return false;
        }
        if (trace) {
            System.out.println("Refres: recognizing " + abbrev + " as abbreviation of " + Resolve.concat(name));
        }
        return true;
    }

    /**
     * Tells whether a name token contributes no initial: it is empty (and so
     * has no first character) or is one of "the", "of", "for", "and".
     */
    private static boolean isIgnoredWord(String word) {
        return word.length() == 0
                || word.equalsIgnoreCase("the")
                || word.equalsIgnoreCase("of")
                || word.equalsIgnoreCase("for")
                || word.equalsIgnoreCase("and");
    }
}

