/*
 * Decompiled with CFR 0.152.
 */
package edu.nyu.jet.hmm;

import edu.nyu.jet.lex.Tokenizer;
import edu.nyu.jet.tipster.Annotation;
import edu.nyu.jet.tipster.Document;
import edu.nyu.jet.tipster.DocumentCollection;
import edu.nyu.jet.tipster.ExternalDocument;
import edu.nyu.jet.tipster.Span;
import edu.nyu.jet.zoner.SentenceSplitter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Vector;

/**
 * Converts a collection of SGML/XML-annotated documents into a "BIO" tag file:
 * one line per token of the form {@code <token text> <tag>}, with an empty line
 * written after every sentence.
 *
 * <p>Tags are {@code B-<TYPE>} for the first token of an ENAMEX annotation,
 * {@code I-<TYPE>} for subsequent tokens covered by the same annotation, and
 * {@code O} for tokens outside any ENAMEX annotation. Although ENAMEX, TIMEX
 * and NUMEX tags are all read from the documents, only ENAMEX annotations are
 * consulted when tags are written.
 *
 * <p>The class keeps the current output stream in a static field, so only one
 * conversion should be run at a time.
 */
public class BIOWriter {
    private static final String home = "C:/Documents and Settings/Ralph Grishman/My Documents/";
    private static final String xmlCollection = home + "HMM/NE/ACE training Collection.txt";
    private static final String bioFile = home + "HMM/NE/aceTrainingBIO.txt";
    private static final String[] tagsToRead = new String[]{"ENAMEX", "TIMEX", "NUMEX"};

    /** Output stream for the conversion in progress; null when no conversion is running. */
    private static PrintStream writer;

    /**
     * Converts the hard-coded default collection into the hard-coded default BIO file.
     *
     * @param args ignored
     * @throws IOException if the output file cannot be opened or written
     */
    public static void main(String[] args) throws IOException {
        BIOWriter.convertCollection(xmlCollection, bioFile);
    }

    /**
     * Command-line entry point used as {@code jet -BIOWriter <XML-collection> <BIO-file>}.
     * Exits the JVM with status 1 if the argument count is wrong; I/O errors are
     * reported on standard output.
     *
     * @param args three elements: args[0] is not read here (the usage message
     *             suggests it is the {@code -BIOWriter} switch), args[1] is the
     *             collection file name and args[2] is the BIO output file name
     */
    public static void task(String[] args) {
        if (args.length != 3) {
            System.out.println("BIOWriter requires 2 arguments: jet -BIOWriter <XML-collection> <BIO-file>");
            System.exit(1);
        }
        String collectionName = args[1];
        String outputName = args[2];
        try {
            BIOWriter.convertCollection(collectionName, outputName);
        }
        catch (IOException e) {
            System.out.println("BIOWriter IO error: " + e);
        }
    }

    /**
     * Reads every document of a collection, splits its {@code text} region into
     * sentences, tokenizes each sentence, and writes one token/tag line per token
     * to the BIO file.
     *
     * <p>The output file is always closed before this method returns, including
     * when an exception is thrown part-way through. Documents that have no
     * {@code text} annotation are skipped with a message on standard output.
     *
     * @param xmlCollectionName file name of the document collection to read
     * @param bioFileName       file name of the BIO file to create (overwritten if it exists)
     * @throws IOException if the output file cannot be opened, or if an error
     *                     occurred while writing to it
     */
    public static void convertCollection(String xmlCollectionName, String bioFileName) throws IOException {
        DocumentCollection collection = new DocumentCollection(xmlCollectionName);
        PrintStream out = new PrintStream(new FileOutputStream(bioFileName));
        writer = out;
        try {
            collection.open();
            for (int i = 0; i < collection.size(); ++i) {
                ExternalDocument doc = collection.get(i);
                System.out.println("Processing document " + doc.fileName());
                doc.setSGMLtags(tagsToRead);
                doc.open();
                doc.annotateWithTag("text");
                // annotationsOfType is null-checked elsewhere in this class, so guard
                // here too rather than fail with a NullPointerException.
                Vector<Annotation> texts = doc.annotationsOfType("text");
                if (texts == null || texts.isEmpty()) {
                    System.out.println("No text annotation in document " + doc.fileName() + ", skipped.");
                    continue;
                }
                Span textSpan = texts.get(0).span();
                // Drop name/time/number markup lying outside the text region.
                for (String tag : tagsToRead) {
                    BIOWriter.eraseAnnotationsOutside(doc, tag, textSpan);
                }
                SentenceSplitter.split(doc, textSpan);
                Vector<Annotation> sentences = doc.annotationsOfType("sentence");
                if (sentences == null) {
                    continue;
                }
                for (Annotation sentence : sentences) {
                    Span sentenceSpan = sentence.span();
                    Tokenizer.tokenize(doc, sentenceSpan);
                    BIOWriter.writeTags(doc, sentenceSpan);
                }
            }
            // PrintStream never throws on write failure; it only records an error flag.
            // checkError() also flushes the stream.
            if (out.checkError()) {
                throw new IOException("BIOWriter: error writing to " + bioFileName);
            }
        }
        finally {
            out.close();
            writer = null;
        }
    }

    /**
     * Writes one {@code <token> <tag>} line for each token of a sentence, followed
     * by an empty line.
     *
     * <p>An ENAMEX annotation starting at a token opens a {@code B-}/{@code I-}
     * run that lasts until a token ends at or beyond the annotation's end. An
     * ENAMEX that starts while another run is still open is reported as nested
     * and ignored.
     *
     * @param doc          the document containing the sentence; must already be tokenized
     * @param sentenceSpan the span of the sentence to write
     */
    private static void writeTags(Document doc, Span sentenceSpan) {
        int end = sentenceSpan.end();
        // presumably advances past leading whitespace - confirm against Tokenizer.skipWSX
        int posn = Tokenizer.skipWSX(doc, sentenceSpan.start(), end);
        String continuationTag = "O";
        // End offset of the ENAMEX run currently open; 0 means no run is open.
        int markupEnd = 0;
        while (posn < end) {
            Annotation token = doc.tokenAt(posn);
            if (token == null) {
                // NOTE(review): defensive guard - assumes tokenAt yields null when no
                // token starts at posn; without a token we cannot advance, so stop.
                System.out.println("No token at position " + posn + ", rest of sentence skipped.");
                break;
            }
            String tokenText = doc.text(token).trim();
            String tokenTag = continuationTag;
            Vector<Annotation> enamexes = doc.annotationsAt(posn, "ENAMEX");
            if (enamexes != null && enamexes.size() > 0) {
                Annotation enamex = enamexes.get(0);
                String tag = (String)enamex.get("TYPE");
                if (markupEnd == 0) {
                    tokenTag = ("B-" + tag).intern();
                    continuationTag = ("I-" + tag).intern();
                    markupEnd = enamex.span().end();
                } else {
                    System.out.println("Nested tag " + tag + " ignored.");
                    System.out.println("(tag from annotation " + enamex + ")");
                }
            }
            writer.println(tokenText + " " + tokenTag);
            posn = token.span().end();
            if (markupEnd != 0 && posn > markupEnd) {
                System.out.println("Annotation does not end at token boundary");
                System.out.println("(annotation ends at " + markupEnd + ", token boundary is " + posn + ")");
            }
            // Close the run once this token reaches the annotation's end; this also
            // holds trivially (posn >= 0) when no run is open.
            if (posn >= markupEnd) {
                markupEnd = 0;
                continuationTag = "O";
            }
        }
        if (markupEnd != 0) {
            System.out.println("Annotation extends past text [sentence] boundary");
            System.out.println("(annotation ends at " + markupEnd + ")");
        }
        writer.println();
    }

    /**
     * Removes from the document every annotation of the given type whose span is
     * not within the given span.
     *
     * @param doc  the document to modify
     * @param type the annotation type to filter
     * @param span the region in which annotations are kept
     */
    private static void eraseAnnotationsOutside(Document doc, String type, Span span) {
        Vector<Annotation> annotations = doc.annotationsOfType(type);
        if (annotations == null) {
            return;
        }
        // Iterate over a copy, as the original did: removeAnnotation may modify the
        // vector returned by the document.
        Vector<Annotation> snapshot = new Vector<Annotation>(annotations);
        for (Annotation a : snapshot) {
            if (!a.span().within(span)) {
                doc.removeAnnotation(a);
            }
        }
    }
}

