/*
 * Decompiled with CFR 0.152.
 */
package eus.ixa.ixa.pipe.tok;

import eus.ixa.ixa.pipe.seg.RuleBasedSegmenter;
import eus.ixa.ixa.pipe.tok.NonPeriodBreaker;
import eus.ixa.ixa.pipe.tok.Normalizer;
import eus.ixa.ixa.pipe.tok.Token;
import eus.ixa.ixa.pipe.tok.TokenFactory;
import eus.ixa.ixa.pipe.tok.Tokenizer;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Tokenizer that splits already-segmented sentences into tokens by applying a
 * fixed sequence of regular-expression rewrites, and records for every token
 * its character offset in the text supplied at construction time.
 *
 * <p>The pattern fields are public and non-final because that is how they are
 * exposed to the rest of the project; they are treated as constants here and
 * never reassigned by this class.
 */
public class RuleBasedTokenizer
implements Tokenizer {
    /** The Unicode replacement character (U+FFFD); tokens consisting of it may be dropped. */
    public static Pattern replacement = Pattern.compile("\ufffd", Pattern.UNICODE_CHARACTER_CLASS);
    /** Control characters, the plain space, DEL through the no-break space; rewritten to a space. */
    public static Pattern asciiHex = Pattern.compile("[\u0000- \u007f-\u00a0]", Pattern.UNICODE_CHARACTER_CLASS);
    /** Characters from the General Punctuation block that are rewritten to a space. */
    public static Pattern generalBlankPunctuation = Pattern.compile("[\u2000-\u200f\u2028-\u202f\u205f-\u206f]", Pattern.UNICODE_CHARACTER_CLASS);
    /** One or more consecutive spaces. */
    public static Pattern doubleSpaces = Pattern.compile("[\\  ]+");
    /**
     * Any single character that is not alphanumeric, whitespace, or one of the
     * punctuation marks handled by dedicated rules; it gets surrounded by spaces.
     * NOTE(review): the escape for code point U+8212 looks like the decimal value
     * of the em dash (U+2014) written as hex; left untouched because changing it
     * would alter matching. The same applies to spaceDashSpace.
     */
    public static Pattern specials = Pattern.compile("([^@#\\p{Alnum}\\p{Space}\\.\u2014\u8212\u2013\\-\\\u00bf\\?\\\u00a1\\!'`,/'\u0091\u0092\u2019\u201a\u201b\u203a\u2018\u2039])", Pattern.UNICODE_CHARACTER_CLASS);
    /** Runs of question and exclamation marks, including the inverted forms. */
    public static Pattern qexc = Pattern.compile("([\\\u00bf\\?\\\u00a1\\!]+)");
    /** Dashes or slashes that have spaces on at least one side. */
    public static Pattern spaceDashSpace = Pattern.compile("([\\ ]+[\u2014\u8212\u2013\\-/]+|[\u2014\u8212\u2013\\-/]+[\\ ]+)");
    /** Two or more consecutive dots; group 1 holds all dots but the first. */
    public static Pattern multiDots = Pattern.compile("\\.([\\.]+)");
    /** The multi-dot placeholder followed by a literal dot. */
    public static Pattern dotmultiDot = Pattern.compile("DOTMULTI\\.");
    /** The multi-dot placeholder, a dot, then a non-dot character. Not used by this class any more. */
    public static Pattern dotmultiDotAny = Pattern.compile("DOTMULTI\\.([^\\.])");
    /** A non-digit followed by a comma. */
    public static Pattern noDigitComma = Pattern.compile("([^\\p{Digit}])(,)", Pattern.UNICODE_CHARACTER_CLASS);
    /** A comma followed by a non-digit. */
    public static Pattern commaNoDigit = Pattern.compile("(,)([^\\p{Digit}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** digit, comma, non-digit. */
    public static Pattern digitCommaNoDigit = Pattern.compile("([\\p{Digit}])(,)([^\\p{Digit}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** non-digit, comma, digit. */
    public static Pattern noDigitCommaDigit = Pattern.compile("([^\\p{Digit}])(,)(\\p{Digit})", Pattern.UNICODE_CHARACTER_CLASS);
    /** Regex alternation of the domain and file suffixes used to recognise the end of a link. */
    public static final String TLD = "\\.asp|\\.at|\\.au|\\.az|\\.be|\\.biz|\\.cat|\\.ch|\\.com|\\.cym|\\.cz|\\.de|\\.dk|\\.edu|\\.es|\\.eu|\\.eus|\\.fr|\\.gal|\\.gov|\\.hk|\\.hu|\\.ie|\\.il|\\.info|\\.htm|\\.html|\\.it|\\.jp|\\.pl|\\.pt|\\.net|\\.nl|\\.org|\\.ru|\\.se|\\.sg|\\.sv|\\.uk|\\.zw";
    /** An http/ftp link that earlier rules split apart with spaces, up to one of the TLD suffixes. */
    public static Pattern wrongLink = Pattern.compile("((http|ftp)\\s:\\s//\\s*[\\s\\p{Alpha}\\p{Digit}+&@#/%?=~_|!:,.;-]+(" + TLD + "))", Pattern.UNICODE_CHARACTER_CLASS);
    /** The scheme part of a link with whitespace around the colon and slashes. */
    public static Pattern beginLink = Pattern.compile("(http|ftp)(\\s:\\s)(/\\s*/\\s*)");
    /** A TLD suffix separated by whitespace from a following slash. */
    public static Pattern endLink = Pattern.compile("(" + TLD + ")\\s+(/)");
    /** non-letter, apostrophe, non-letter. */
    public static Pattern noAlphaAposNoAlpha = Pattern.compile("([^\\p{Alpha}])(['\u0091\u0092\u2019\u201a\u201b\u203a\u2018\u2039])([^\\p{Alpha}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** neither letter nor digit, apostrophe, letter. */
    public static Pattern noAlphaDigitAposAlpha = Pattern.compile("([^\\p{Alpha}\\d])(['\u0091\u0092\u2019\u201a\u201b\u203a\u2018\u2039])(\\p{Alpha})", Pattern.UNICODE_CHARACTER_CLASS);
    /** letter, apostrophe, non-letter. */
    public static Pattern alphaAposNonAlpha = Pattern.compile("(\\p{Alpha})(['\u0091\u0092\u2019\u201a\u201b\u203a\u2018\u2039])([^\\p{Alpha}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** letter, apostrophe, letter. */
    public static Pattern AlphaAposAlpha = Pattern.compile("(\\p{Alpha})(['\u0091\u0092\u2019\u201a\u201b\u203a\u2018\u2039])(\\p{Alpha})", Pattern.UNICODE_CHARACTER_CLASS);
    /** letter, "n" plus apostrophe, "t" (as in "don't"). */
    public static Pattern englishNegations = Pattern.compile("(\\p{Alpha})(n['\u0091\u0092\u2019\u201a\u201b\u203a\u2018\u2039])([t])", Pattern.UNICODE_CHARACTER_CLASS);
    /** letter, apostrophe, then one of m/s/d (either case), "re", "ve" or "ll". */
    public static Pattern englishApos = Pattern.compile("(\\p{Alpha})(['\u0091\u0092\u2019\u201a\u201b\u203a\u2018\u2039])([msdMSD]|re|ve|ll)", Pattern.UNICODE_CHARACTER_CLASS);
    /** digit, apostrophe, "s" (as in "1990's"). */
    public static Pattern yearApos = Pattern.compile("([\\p{Digit}])(['\u0091\u0092\u2019\u201a\u201b\u203a\u2018\u2039])([s])", Pattern.UNICODE_CHARACTER_CLASS);
    /** non-letter followed by an apostrophe at the end of the input. */
    public static Pattern endOfSentenceApos = Pattern.compile("([^\\p{Alpha}])(['\u0091\u0092\u2019\u201a\u201b\u203a\u2018\u2039])$");
    /** "n", apostrophe, whitespace, "t": a negation that an earlier rule split and that is re-joined. */
    public static Pattern deTokenEnglishNegation = Pattern.compile("([n])(['\u0091\u0092\u2019\u201a\u201b\u203a\u2018\u2039])\\s+([t])", Pattern.UNICODE_CHARACTER_CLASS);
    /** Two pilcrow signs separated only by spaces; they are re-joined into one token. */
    public static Pattern detokenParagraphs = Pattern.compile("(\u00b6)[\\ ]*(\u00b6)", Pattern.UNICODE_CHARACTER_CLASS);

    /** Compile-time switch for diagnostic output. */
    private static final boolean DEBUG = false;

    private final TokenFactory tokenFactory;
    private final NonPeriodBreaker nonBreaker;
    /** Value of the "language" property; may be null when the property is absent. */
    private final String lang;
    /** Text in which token offsets are looked up, as returned by RuleBasedSegmenter.buildText. */
    private final String originalText;
    /** When true, tokens made of the replacement character are kept instead of dropped. */
    private boolean unTokenizable;

    /**
     * Creates a tokenizer for one document.
     *
     * @param text the document text; it is passed through RuleBasedSegmenter.buildText
     *        and the result is used to compute token offsets
     * @param properties configuration; this class reads the "language" and
     *        "untokenizable" keys and hands the object on to NonPeriodBreaker
     */
    public RuleBasedTokenizer(String text, Properties properties) {
        this.lang = properties.getProperty("language");
        this.printUntokenizable(properties);
        this.nonBreaker = new NonPeriodBreaker(properties);
        this.tokenFactory = new TokenFactory();
        this.originalText = RuleBasedSegmenter.buildText(text);
    }

    /**
     * Tokenizes every sentence and attaches an offset to each token.
     *
     * <p>Offsets are found by searching the original text forward from the end
     * of the previous token. When a token string cannot be found, the offset
     * falls back to one past the previous end. A throughput line is always
     * written to standard error.
     *
     * @param sentences the sentences to tokenize, in document order
     * @return one token list per input sentence, after normalization
     */
    @Override
    public List<List<Token>> tokenize(String[] sentences) {
        long start = System.nanoTime();
        int noTokens = 0;
        // End offset of the previously placed token; the search for the next one starts here.
        int prevIndex = 0;
        List<List<Token>> result = new ArrayList<List<Token>>();
        for (String sentence : sentences) {
            if (DEBUG) {
                System.err.println("-> Segmented:" + sentence);
            }
            List<Token> tokens = new ArrayList<Token>();
            String[] curTokens = this.getTokens(sentence);
            for (String arrayToken : curTokens) {
                int curIndex = this.originalText.indexOf(arrayToken, prevIndex);
                if (curIndex == -1) {
                    // The rewritten token no longer occurs verbatim in the text.
                    curIndex = prevIndex + 1;
                }
                Token curToken = this.tokenFactory.createToken(arrayToken, curIndex, arrayToken.length());
                this.addTokens(curToken, tokens);
                if (DEBUG) {
                    System.err.println("-> Token:" + arrayToken + " curIndex: " + curIndex + " prev: " + prevIndex);
                }
                prevIndex = curIndex + curToken.tokenLength();
            }
            result.add(tokens);
            // Counts every split piece, including ones addTokens decided to drop.
            noTokens += curTokens.length;
        }
        RuleBasedTokenizer.normalizeTokens(result, this.lang);
        long duration = System.nanoTime() - start;
        double toksPerSecond = (double)noTokens / ((double)duration / 1.0E9);
        System.err.printf("ixa-pipe-tok tokenized %d tokens at %.2f tokens per second.%n", noTokens, toksPerSecond);
        return result;
    }

    /**
     * Runs the rewrite pipeline on one sentence and splits the result on single spaces.
     * The order of the steps matters: later rules rely on the spacing produced by earlier ones.
     *
     * @param line the sentence text
     * @return the token strings; an empty sentence yields a single empty string
     */
    private String[] getTokens(String line) {
        // Whitespace clean-up.
        line = line.trim();
        line = doubleSpaces.matcher(line).replaceAll(" ");
        line = asciiHex.matcher(line).replaceAll(" ");
        line = generalBlankPunctuation.matcher(line).replaceAll(" ");
        // Isolate punctuation.
        line = qexc.matcher(line).replaceAll(" $1 ");
        line = spaceDashSpace.matcher(line).replaceAll(" $1 ");
        line = specials.matcher(line).replaceAll(" $1 ");
        // Hide runs of dots behind placeholders until restoreMultidots below.
        line = this.generateMultidots(line);
        // Detach commas unless they sit between two digits.
        line = noDigitComma.matcher(line).replaceAll("$1 $2");
        line = commaNoDigit.matcher(line).replaceAll("$1 $2");
        line = digitCommaNoDigit.matcher(line).replaceAll("$1 $2 $3");
        line = noDigitCommaDigit.matcher(line).replaceAll("$1 $2 $3");
        line = this.treatContractions(line);
        line = this.nonBreaker.TokenizerNonBreaker(line);
        line = this.restoreMultidots(line);
        // Re-join links that the rules above split apart.
        line = this.detokenizeURLs(line);
        line = beginLink.matcher(line).replaceAll("$1://");
        line = endLink.matcher(line).replaceAll("$1$2");
        line = line.trim();
        line = doubleSpaces.matcher(line).replaceAll(" ");
        line = detokenParagraphs.matcher(line).replaceAll("$1$2");
        if (DEBUG) {
            System.out.println("->Tokens:" + line);
        }
        return line.split(" ");
    }

    /**
     * Replaces every run of two or more dots by a space-delimited placeholder
     * that contains no literal dot: the first dot becomes "DOTMULTI" and each
     * further dot adds one "DOT" prefix.
     *
     * @param line the text to rewrite
     * @return the text with dot runs encoded as placeholders
     */
    private String generateMultidots(String line) {
        line = multiDots.matcher(line).replaceAll(" DOTMULTI$1 ");
        Matcher pending = dotmultiDot.matcher(line);
        // Each pass folds one remaining literal dot into the placeholder.
        // The previous code also ran dotmultiDotAny on the local string first, but
        // Matcher.replaceAll works on the matcher's own input, so that intermediate
        // result was always overwritten; dropping it leaves the output unchanged.
        while (pending.find()) {
            line = pending.replaceAll("DOTDOTMULTI");
            pending.reset(line);
        }
        return line;
    }

    /**
     * Reverses {@link #generateMultidots(String)}: turns each "DOT" prefix and
     * the final "DOTMULTI" back into literal dots.
     *
     * @param line text containing placeholders
     * @return the text with the dot runs restored
     */
    private String restoreMultidots(String line) {
        while (line.contains("DOTDOTMULTI")) {
            line = line.replace("DOTDOTMULTI", "DOTMULTI.");
        }
        return line.replace("DOTMULTI", ".");
    }

    /**
     * Inserts spaces around apostrophes according to their neighbours. The
     * English negation rule is applied only when the language is "en"; the
     * remaining rules are applied for every language.
     *
     * @param line the text to rewrite
     * @return the text with apostrophes and clitics separated
     */
    private String treatContractions(String line) {
        line = noAlphaAposNoAlpha.matcher(line).replaceAll("$1 $2 $3");
        line = noAlphaDigitAposAlpha.matcher(line).replaceAll("$1 $2 $3");
        line = alphaAposNonAlpha.matcher(line).replaceAll("$1 $2 $3");
        // Constant-first comparison: lang is null when no "language" property was given.
        if ("en".equalsIgnoreCase(this.lang)) {
            line = englishNegations.matcher(line).replaceAll("$1 $2$3");
        }
        line = englishApos.matcher(line).replaceAll("$1 $2$3");
        line = yearApos.matcher(line).replaceAll("$1 $2$3");
        line = AlphaAposAlpha.matcher(line).replaceAll("$1$2 $3");
        line = endOfSentenceApos.matcher(line).replaceAll("$1 $2");
        line = deTokenEnglishNegation.matcher(line).replaceAll("$1$2$3");
        return line;
    }

    /**
     * Removes all whitespace inside every match of {@link #wrongLink}.
     *
     * @param line the text to rewrite
     * @return the text with split links glued back together
     */
    private String detokenizeURLs(String line) {
        Matcher linkMatcher = wrongLink.matcher(line);
        StringBuffer sb = new StringBuffer();
        while (linkMatcher.find()) {
            String compact = linkMatcher.group().replaceAll("\\s", "");
            // Quote so the matched text is always inserted literally, never read as a group reference.
            linkMatcher.appendReplacement(sb, Matcher.quoteReplacement(compact));
        }
        linkMatcher.appendTail(sb);
        return sb.toString();
    }

    /**
     * Reads the "untokenizable" property and stores whether it equals "yes"
     * (case-insensitive). A missing property counts as "no".
     *
     * @param properties the configuration to read
     */
    private void printUntokenizable(Properties properties) {
        String untokenizable = properties.getProperty("untokenizable");
        // Constant-first comparison so an absent property does not throw.
        this.unTokenizable = "yes".equalsIgnoreCase(untokenizable);
    }

    /**
     * Appends a token to the sentence unless it is empty or, when untokenizable
     * output is disabled, consists of the replacement character.
     *
     * @param curToken the candidate token
     * @param tokens the sentence's token list, modified in place
     */
    private void addTokens(Token curToken, List<Token> tokens) {
        if (curToken.tokenLength() == 0) {
            return;
        }
        if (this.unTokenizable || !replacement.matcher(curToken.getTokenValue()).matches()) {
            tokens.add(curToken);
        }
    }

    /**
     * Applies the Normalizer steps (non-canonical strings, quotes, double
     * quotes) to every sentence, in that order.
     *
     * @param tokens the tokenized sentences, handed to Normalizer one sentence at a time
     * @param lang the language code handed to Normalizer
     */
    public static void normalizeTokens(List<List<Token>> tokens, String lang) {
        for (List<Token> sentence : tokens) {
            Normalizer.convertNonCanonicalStrings(sentence, lang);
            Normalizer.normalizeQuotes(sentence, lang);
            Normalizer.normalizeDoubleQuotes(sentence, lang);
        }
    }
}

