/*
 * Decompiled with CFR 0.152.
 */
package edu.nyu.jet.lex;

import edu.nyu.jet.lisp.FeatureSet;
import edu.nyu.jet.tipster.Annotation;
import edu.nyu.jet.tipster.Document;
import edu.nyu.jet.tipster.Span;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Splits text into tokens.
 *
 * <p>Text is first cut into whitespace-delimited blocks; each block is then
 * split further at punctuation, with special handling for doubled quote and
 * dash characters, ellipses, commas inside numbers, sentence-final periods,
 * contraction suffixes (such as {@code 's} and {@code n't}), {@code &...;}
 * entities, e-mail addresses and {@code http://} URLs.
 *
 * <p>Tokens are either recorded as {@code token} annotations on a
 * {@link Document} (see {@link #tokenize(Document, Span)}) or returned as
 * strings (see {@link #tokenize(String)}).
 *
 * <p>All working state is held in static fields, so concurrent calls from
 * several threads would interfere with one another.
 */
public class Tokenizer {
    /** Tokens collected by the most recent {@link #tokenize(String)} call. */
    static Vector<String> tokens;
    /** Text of the most recently built token; reset to "" at the start of each run. */
    static String lastToken;
    /** Two-character suffixes that are split off as separate tokens. */
    static HashSet<String> suffixes2;
    /** Three-character suffixes that are split off as separate tokens. */
    static HashSet<String> suffixes3;
    /** Maps the start offset of each e-mail/URL match to its end offset. */
    private static HashMap<Integer, Integer> specialTokenEnd;
    /** Maps the start offset of each e-mail/URL match to "email" or "url". */
    private static HashMap<Integer, String> specialTokenType;

    private static final String USER_NAME_PAT_STG = "[a-zA-Z0-9_\\.-]+";
    private static final String DOMAIN_NAME_PAT_STG = "([a-zA-Z0-9-]+\\.)+[a-zA-Z0-9]+";
    /** User name, optionally followed by an (optionally space-padded) "...", then '@' and a domain. */
    private static final String EMAIL_PAT_STG =
            USER_NAME_PAT_STG + "( ?\\.\\.\\. ?)?@" + DOMAIN_NAME_PAT_STG;
    private static final String PATH_PAT_STG = "[a-zA-Z0-9_=\\?/-]+";
    private static final String URL_PAT_STG = "http://" + DOMAIN_NAME_PAT_STG + PATH_PAT_STG;
    private static final Pattern EMAIL_PAT = Pattern.compile(EMAIL_PAT_STG);
    private static final Pattern URL_PAT = Pattern.compile(URL_PAT_STG);

    static {
        suffixes2 = new HashSet<String>();
        suffixes3 = new HashSet<String>();
        suffixes2.add("'s");
        suffixes2.add("'m");
        suffixes2.add("'d");
        suffixes2.add("'S");
        suffixes2.add("'M");
        suffixes2.add("'D");
        suffixes3.add("'re");
        suffixes3.add("'ve");
        suffixes3.add("n't");
        suffixes3.add("'ll");
        suffixes3.add("'RE");
        suffixes3.add("'VE");
        suffixes3.add("N'T");
        suffixes3.add("'LL");
    }

    /**
     * Tokenizes the portion of {@code doc} covered by {@code span}, adding a
     * {@code token} annotation for every token found. E-mail and URL tokens
     * additionally receive an {@code ENAMEX} annotation.
     *
     * @param doc  the document to annotate
     * @param span the region of the document text to tokenize
     */
    public static void tokenize(Document doc, Span span) {
        findTokens(doc, doc.text(), span.start(), span.end());
    }

    /**
     * Tokenizes a plain string.
     *
     * @param text the text to tokenize
     * @return the token strings, in order of occurrence
     */
    public static String[] tokenize(String text) {
        tokens = new Vector<String>();
        findTokens(null, text, 0, text.length());
        return tokens.toArray(new String[0]);
    }

    /**
     * Core tokenization loop over {@code text[ic, end)}: cuts the text into
     * whitespace-delimited blocks and hands each block to
     * {@link #splitIntoTokens} and {@link #buildTokens}.
     *
     * @param doc  document to annotate, or {@code null} to collect strings in {@link #tokens}
     * @param text the full text being tokenized
     * @param ic   offset at which to start
     * @param end  offset at which to stop (exclusive)
     */
    private static void findTokens(Document doc, String text, int ic, int end) {
        boolean firstBlock = true;
        lastToken = "";
        findTokensByPattern(doc, text, ic, end);
        // Leading whitespace and leading <...> tags are not tokenized.
        ic = skipWSX(text, ic, end);
        while (ic < end) {
            int tokenStart = ic;
            // Advance to the end of the block. A pattern match (e-mail/URL) is
            // jumped over as a unit, so whitespace inside it (e.g. "a ... @b.c")
            // does not terminate the block.
            Integer tokenEnd = specialTokenEnd.get(ic);
            ic = (tokenEnd != null) ? tokenEnd.intValue() : ic + 1;
            while (ic < end && !Character.isWhitespace(text.charAt(ic))) {
                tokenEnd = specialTokenEnd.get(ic);
                ic = (tokenEnd != null) ? tokenEnd.intValue() : ic + 1;
            }
            String block = text.substring(tokenStart, ic);
            ic = skipWS(text, ic, end);
            // Sentence-final period handling is applied only when annotating a document.
            boolean lastBlock = ic >= end && doc != null;
            boolean[] newToken = splitIntoTokens(block, tokenStart, lastBlock);
            buildTokens(doc, block, newToken, tokenStart, ic, firstBlock);
            firstBlock = false;
        }
    }

    /**
     * Locates e-mail addresses and URLs in {@code text[start, end)} and records
     * them in {@link #specialTokenEnd} and {@link #specialTokenType}, replacing
     * whatever those maps held before. URL matches are recorded after e-mail
     * matches, so a URL starting at the same offset overrides the e-mail entry.
     *
     * @param doc   unused
     * @param text  the text to search
     * @param start start of the region to search
     * @param end   end of the region to search (exclusive)
     */
    private static void findTokensByPattern(Document doc, String text, int start, int end) {
        specialTokenEnd = new HashMap<Integer, Integer>();
        specialTokenType = new HashMap<Integer, String>();
        Matcher emailMatcher = EMAIL_PAT.matcher(text).region(start, end);
        while (emailMatcher.find()) {
            specialTokenEnd.put(emailMatcher.start(), emailMatcher.end());
            specialTokenType.put(emailMatcher.start(), "email");
        }
        Matcher urlMatcher = URL_PAT.matcher(text).region(start, end);
        while (urlMatcher.find()) {
            specialTokenEnd.put(urlMatcher.start(), urlMatcher.end());
            specialTokenType.put(urlMatcher.start(), "url");
        }
    }

    /**
     * Decides where token boundaries fall inside one whitespace-delimited block.
     * The rules are applied in sequence and later rules may override earlier ones.
     *
     * @param blockString the block text
     * @param blockStart  offset of the block within the full text (used to look
     *                    up e-mail/URL matches)
     * @param lastBlock   whether sentence-final period splitting should be applied
     * @return an array of length {@code blockString.length() + 1} in which
     *         element {@code i} is {@code true} if a token starts at block
     *         position {@code i}; the final element is always {@code true}
     */
    private static boolean[] splitIntoTokens(String blockString, int blockStart, boolean lastBlock) {
        char[] block = blockString.toCharArray();
        int blockLength = block.length;
        boolean[] newToken = new boolean[blockLength + 1];
        newToken[blockLength] = true;

        // Every character other than a letter, digit or period is a token by itself.
        for (int i = 0; i < blockLength; ++i) {
            char c = block[i];
            if (!Character.isLetterOrDigit(c) && c != '.') {
                newToken[i] = true;
                newToken[i + 1] = true;
            }
        }

        // Doubled `` '' -- are kept together as one token.
        for (int i = 0; i < blockLength - 1; ++i) {
            char c = block[i];
            boolean pairable = c == '`' || c == '\'' || c == '-';
            if (pairable && c == block[i + 1] && newToken[i]) {
                newToken[i + 1] = false;
            }
        }

        // "..." starting at a token boundary is kept together.
        for (int i = 0; i < blockLength - 2; ++i) {
            if (block[i] == '.' && block[i + 1] == '.' && block[i + 2] == '.' && newToken[i]) {
                newToken[i + 1] = false;
                newToken[i + 2] = false;
            }
        }

        // A comma between two digits stays inside the number.
        // NOTE(review): the bound "blockLength - 2" means a comma followed by
        // exactly one character (e.g. "1,2") is NOT joined; this looks like a
        // possible off-by-one but is kept to preserve existing tokenization.
        for (int i = 1; i < blockLength - 2; ++i) {
            if (block[i] == ',' && Character.isDigit(block[i - 1]) && Character.isDigit(block[i + 1])) {
                newToken[i] = false;
                newToken[i + 1] = false;
            }
        }

        // In the last block, split off a final period, also when it is followed
        // by a single closing quote/bracket or by ''.
        if (lastBlock && blockLength > 0) {
            if (block[blockLength - 1] == '.') {
                newToken[blockLength - 1] = true;
            } else if (blockLength > 1 && block[blockLength - 2] == '.'
                    && "\"'}>)".indexOf(block[blockLength - 1]) >= 0) {
                newToken[blockLength - 2] = true;
            } else if (blockLength > 2 && block[blockLength - 3] == '.'
                    && block[blockLength - 2] == '\'' && block[blockLength - 1] == '\'') {
                newToken[blockLength - 3] = true;
            }
        }

        // Split off three-character suffixes that end at a token boundary.
        for (int i = 0; i < blockLength - 2; ++i) {
            if (newToken[i + 3] && suffixes3.contains(blockString.substring(i, i + 3))) {
                newToken[i] = true;
                newToken[i + 1] = false;
                newToken[i + 2] = false;
            }
        }

        // Split off two-character suffixes that end at a token boundary.
        for (int i = 0; i < blockLength - 1; ++i) {
            if (newToken[i + 2] && suffixes2.contains(blockString.substring(i, i + 2))) {
                newToken[i] = true;
                newToken[i + 1] = false;
            }
        }

        // Keep "&...;" together. Every ('&' at i, later ';' at j) pair clears the
        // boundaries in (i, j]; the union of all such ranges is the single range
        // from the first '&' to the last ';', so one pass suffices.
        int firstAmp = blockString.indexOf('&');
        int lastSemi = blockString.lastIndexOf(';');
        if (firstAmp >= 0 && lastSemi > firstAmp) {
            for (int k = firstAmp + 1; k <= lastSemi; ++k) {
                newToken[k] = false;
            }
        }

        // An e-mail address or URL is a single token, whatever the rules above decided.
        for (int i = 0; i < blockLength; ++i) {
            Integer tokenEnd = specialTokenEnd.get(blockStart + i);
            if (tokenEnd == null) {
                continue;
            }
            newToken[i] = true;
            for (int j = i + 1; j < blockLength && j + blockStart < tokenEnd; ++j) {
                newToken[j] = false;
            }
        }
        return newToken;
    }

    /**
     * Emits the tokens of one block according to the boundaries in
     * {@code newToken}, attaching a feature set to each:
     * {@code type} for e-mail/URL tokens; otherwise {@code case} for tokens
     * starting with an upper-case letter ({@code forcedCap} in the first block
     * or after an opening quote or underscore token, else {@code cap});
     * otherwise {@code intvalue} for tokens made of digits and embedded commas;
     * otherwise no features.
     *
     * @param doc            document to annotate, or {@code null} to collect strings
     * @param block          the block text
     * @param newToken       token-start flags as produced by {@link #splitIntoTokens}
     * @param offset         offset of the block within the full text
     * @param nextBlockStart offset at which the next block starts; the last
     *                       token of the block is extended to this offset so
     *                       that its span includes the trailing whitespace
     * @param firstBlock     whether this is the first block of the run
     */
    private static void buildTokens(Document doc, String block, boolean[] newToken, int offset, int nextBlockStart, boolean firstBlock) {
        int tokenStart = 0;
        for (int i = 1; i <= block.length(); ++i) {
            if (!newToken[i]) {
                continue;
            }
            int tokenEnd = i;

            // Numeric value of the token, or -1 if it is not a plain number.
            // Accumulated in a long so that an over-long digit string is
            // detected instead of silently wrapping around.
            long value = 0;
            for (int j = tokenStart; j < tokenEnd; ++j) {
                char ch = block.charAt(j);
                if (Character.isDigit(ch)) {
                    value = value * 10 + Character.digit(ch, 10);
                    if (value > Integer.MAX_VALUE) {
                        value = -1;
                        break;
                    }
                    continue;
                }
                // Commas are allowed once a non-zero value has been read.
                if (ch == ',' && value > 0) {
                    continue;
                }
                value = -1;
                break;
            }

            FeatureSet fs;
            String type = specialTokenType.get(tokenStart + offset);
            if (type != null) {
                fs = new FeatureSet("type", type);
            } else if (Character.isUpperCase(block.charAt(tokenStart))) {
                boolean forced = firstBlock
                        || lastToken.equals("_")
                        || lastToken.equals("\"")
                        || lastToken.equals("``")
                        || lastToken.equals("`");
                fs = forced ? new FeatureSet("case", "forcedCap") : new FeatureSet("case", "cap");
            } else if (value >= 0) {
                fs = new FeatureSet("intvalue", Integer.valueOf((int) value));
            } else {
                fs = new FeatureSet();
            }

            int spanEnd = (tokenEnd == block.length()) ? nextBlockStart : tokenEnd + offset;
            String tokenString = block.substring(tokenStart, tokenEnd);
            recordToken(doc, tokenString, tokenStart + offset, spanEnd, fs);
            tokenStart = tokenEnd;
            lastToken = tokenString;
        }
    }

    /**
     * Records one token. Without a document the token text is appended to
     * {@link #tokens}; with a document a {@code token} annotation is added and,
     * if {@code fs} has a {@code type} feature, an {@code ENAMEX} annotation
     * whose {@code TYPE} feature carries that value.
     *
     * @param doc   document to annotate, or {@code null}
     * @param text  token text (used only when {@code doc} is {@code null})
     * @param start start offset of the token
     * @param end   end offset of the token
     * @param fs    features for the token annotation
     */
    private static void recordToken(Document doc, String text, int start, int end, FeatureSet fs) {
        if (doc == null) {
            tokens.addElement(text);
        } else {
            doc.annotate("token", new Span(start, end), fs);
            if (fs.get("type") != null) {
                doc.annotate("ENAMEX", new Span(start, end), new FeatureSet("TYPE", fs.get("type")));
            }
        }
    }

    /**
     * Tokenizes {@code span} of {@code doc} purely on whitespace: every maximal
     * run of non-whitespace characters becomes one {@code token} annotation
     * (with an empty feature set) whose span also covers the whitespace that
     * follows it.
     *
     * @param doc  the document to annotate
     * @param span the region of the document text to tokenize
     */
    public static void tokenizeOnWS(Document doc, Span span) {
        String text = doc.text();
        int end = span.end();
        int ic = skipWS(text, span.start(), end);
        while (ic < end) {
            int tokenStart = ic++;
            while (ic < end && !Character.isWhitespace(text.charAt(ic))) {
                ++ic;
            }
            ic = skipWS(text, ic, end);
            // doc is non-null here, so recordToken ignores its text argument.
            recordToken(doc, text, tokenStart, ic, new FeatureSet());
        }
    }

    /**
     * Advances past whitespace in a document.
     *
     * @param doc  the document
     * @param posn offset at which to start
     * @param end  offset at which to stop (exclusive)
     * @return the offset of the first non-whitespace character at or after
     *         {@code posn}, or {@code end} if there is none before {@code end}
     */
    public static int skipWS(Document doc, int posn, int end) {
        while (posn < end && Character.isWhitespace(doc.charAt(posn))) {
            ++posn;
        }
        return posn;
    }

    /**
     * Advances past whitespace in a string.
     *
     * @param text the text
     * @param posn offset at which to start
     * @param end  offset at which to stop (exclusive)
     * @return the offset of the first non-whitespace character at or after
     *         {@code posn}, or {@code end} if there is none before {@code end}
     */
    public static int skipWS(String text, int posn, int end) {
        while (posn < end && Character.isWhitespace(text.charAt(posn))) {
            ++posn;
        }
        return posn;
    }

    /**
     * Advances past whitespace and {@code <...>} tags in a document. A
     * {@code '<'} with no matching {@code '>'} before {@code end} consumes
     * everything up to {@code end}.
     *
     * @param doc  the document
     * @param posn offset at which to start
     * @param end  offset at which to stop (exclusive)
     * @return the offset of the first character that is neither whitespace nor
     *         part of a tag, or {@code end} if there is none
     */
    public static int skipWSX(Document doc, int posn, int end) {
        while (posn < end) {
            if (Character.isWhitespace(doc.charAt(posn))) {
                ++posn;
                continue;
            }
            if (doc.charAt(posn) != '<') {
                break;
            }
            ++posn;
            while (posn < end && doc.charAt(posn) != '>') {
                ++posn;
            }
            if (posn < end) {
                ++posn; // step over the closing '>'
            }
        }
        return posn;
    }

    /**
     * Advances past whitespace and {@code <...>} tags in a string. A
     * {@code '<'} with no matching {@code '>'} before {@code end} consumes
     * everything up to {@code end}.
     *
     * @param text the text
     * @param posn offset at which to start
     * @param end  offset at which to stop (exclusive)
     * @return the offset of the first character that is neither whitespace nor
     *         part of a tag, or {@code end} if there is none
     */
    public static int skipWSX(String text, int posn, int end) {
        while (posn < end) {
            if (Character.isWhitespace(text.charAt(posn))) {
                ++posn;
                continue;
            }
            if (text.charAt(posn) != '<') {
                break;
            }
            ++posn;
            while (posn < end && text.charAt(posn) != '>') {
                ++posn;
            }
            if (posn < end) {
                ++posn; // step over the closing '>'
            }
        }
        return posn;
    }

    /**
     * Collects the consecutive token annotations of {@code doc} within
     * {@code span}. Starting after any leading whitespace/tags, tokens are
     * followed end-to-start until the end of the span is reached or no token
     * starts at the current position.
     *
     * @param doc  the (already tokenized) document
     * @param span the region to scan
     * @return the token annotations found, in order
     */
    public static Annotation[] gatherTokens(Document doc, Span span) {
        int end = span.end();
        ArrayList<Annotation> found = new ArrayList<Annotation>();
        int posn = skipWSX(doc, span.start(), end);
        Annotation token;
        while (posn < end && (token = doc.tokenAt(posn)) != null) {
            found.add(token);
            posn = token.span().end();
        }
        return found.toArray(new Annotation[found.size()]);
    }

    /**
     * Collects the text of the consecutive tokens of {@code doc} within
     * {@code span}, with surrounding whitespace trimmed from each.
     *
     * @param doc  the (already tokenized) document
     * @param span the region to scan
     * @return the trimmed token strings, in order
     */
    public static String[] gatherTokenStrings(Document doc, Span span) {
        Annotation[] tokenAnns = gatherTokens(doc, span);
        String[] stgs = new String[tokenAnns.length];
        for (int i = 0; i < tokenAnns.length; ++i) {
            stgs[i] = doc.text(tokenAnns[i]).trim();
        }
        return stgs;
    }

    /**
     * Self-test: tokenizes a fixed sentence, reports whether the expected
     * eight tokens were produced, and prints the tokens.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // The input must be the sentence the checks below describe; the
        // previous input could never yield these eight tokens.
        Document doc = new Document("'grishman ... @cs.nyu.edu' sold $3,100 shares.");
        Tokenizer.tokenize(doc, doc.fullSpan());
        String[] result = Tokenizer.gatherTokenStrings(doc, doc.fullSpan());
        if (result.length == 8 && result[0].equals("'") && result[1].equals("grishman ... @cs.nyu.edu") && result[2].equals("'") && result[3].equals("sold") && result[4].equals("$") && result[5].equals("3,100") && result[6].equals("shares") && result[7].equals(".")) {
            System.out.println("Tokenizer validation succeeds.");
        } else {
            System.out.println("Tokenizer validation fails.");
        }
        for (int i = 0; i < result.length; ++i) {
            System.out.println("  tokens[" + i + "] = " + result[i]);
        }
    }
}

