/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.tokenizers.uk;

import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.tokenizers.Tokenizer;

/**
 * Word tokenizer for Ukrainian text.
 *
 * <p>The text is split on whitespace, invisible formatting characters and punctuation (the
 * characters listed in {@code SPLIT_CHARS}); every delimiter character is returned as a token
 * of its own. Before splitting, delimiter characters that belong inside a token (decimal comma,
 * dots in numbers and dates, a colon between digits, parentheses inside a word, the dot of a
 * lower-case abbreviation) are swapped for private-use placeholder characters, and the
 * placeholders are swapped back in the produced tokens. A three-dot ellipsis is collapsed to a
 * single placeholder that is itself a delimiter, so it comes back as one {@code "..."} token.
 */
public class UkrainianWordTokenizer implements Tokenizer {
    /** Regex flags shared by the case-insensitive patterns below. */
    private static final int CASE_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    /** Every character that terminates a token; the last one is the ellipsis placeholder. */
    private static final String SPLIT_CHARS = " \u00a0\u115f\u1160\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u200c\u200d\u200e\u200f\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f\u205f\u2060\u2061\u2062\u2063\u206a\u206b\u206c\u206d\u206e\u206f\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb,.;()[]{}<>!?:/|\\\"\u00ab\u00bb\u201e\u201d\u201c`\u00b4\u2018\u201b\u2032\u2026\u00bf\u00a1\t\n\r\ue100";

    /** A comma with a digit on both sides. */
    private static final Pattern DECIMAL_COMMA_PATTERN = Pattern.compile("([\\d]),([\\d])", CASE_FLAGS);
    private static final char DECIMAL_COMMA_SUBST = '\ue001';

    /** A dot with a digit on both sides. */
    private static final Pattern DOTTED_NUMBERS_PATTERN = Pattern.compile("([\\d])\\.([\\d])", CASE_FLAGS);
    private static final char NUMBER_DOT_SUBST = '\ue002';

    /** A colon with a digit on both sides. */
    private static final Pattern COLON_NUMBERS_PATTERN = Pattern.compile("([\\d]):([\\d])", CASE_FLAGS);
    private static final char COLON_DOT_SUBST = '\ue003';

    /** Dates written as dd.mm.yyyy, yyyy.mm.dd or yyyy-mm-dd. */
    private static final Pattern DATE_PATTERN = Pattern.compile("([\\d]{2})\\.([\\d]{2})\\.([\\d]{4})|([\\d]{4})\\.([\\d]{2})\\.([\\d]{2})|([\\d]{4})-([\\d]{2})-([\\d]{2})", CASE_FLAGS);
    private static final char DATE_DOT_SUBST = '\ue004';

    /** A parenthesised run of letters/apostrophes directly attached to a preceding letter/apostrophe. */
    private static final Pattern BRACE_IN_WORD_PATTERN = Pattern.compile("([\u0430-\u044f\u0456\u0457\u0454\u0491'])\\(([\u0430-\u044f\u0456\u0457\u0454\u0491']+)\\)", CASE_FLAGS);
    private static final char LEFT_BRACE_SUBST = '\ue005';
    private static final char RIGHT_BRACE_SUBST = '\ue006';

    /** Lower-case letter, dot, single space, lower-case letter (this pattern is case-sensitive). */
    private static final Pattern ABBR_DOT_PATTERN = Pattern.compile("([\u0430-\u044f\u0456\u0457\u0454\u0491])\\. ([\u0430-\u044f\u0456\u0457\u0454\u0491])");
    private static final char ABBR_DOT_SUBST = '\ue007';

    private static final String ELLIPSIS = "...";
    private static final String ELLIPSIS_SUBST = "\ue100";

    /**
     * Splits {@code text} into tokens.
     *
     * @param text the text to tokenize; must not be {@code null}
     * @return the tokens in order of appearance, delimiter characters included as separate tokens
     */
    public List<String> tokenize(String text) {
        text = cleanup(text);

        // Hide delimiters that are part of a token behind placeholders so that the
        // StringTokenizer below does not split on them.
        if (text.contains(",")) {
            text = DECIMAL_COMMA_PATTERN.matcher(text).replaceAll("$1" + DECIMAL_COMMA_SUBST + "$2");
        }
        if (text.contains(".")) {
            text = protectDates(text);
            text = DOTTED_NUMBERS_PATTERN.matcher(text).replaceAll("$1" + NUMBER_DOT_SUBST + "$2");
            text = ABBR_DOT_PATTERN.matcher(text).replaceAll("$1" + ABBR_DOT_SUBST + " $2");
        }
        if (text.contains(":")) {
            text = COLON_NUMBERS_PATTERN.matcher(text).replaceAll("$1" + COLON_DOT_SUBST + "$2");
        }
        if (text.contains("(")) {
            text = BRACE_IN_WORD_PATTERN.matcher(text)
                    .replaceAll("$1" + LEFT_BRACE_SUBST + "$2" + RIGHT_BRACE_SUBST);
        }
        if (text.contains(ELLIPSIS)) {
            text = text.replace(ELLIPSIS, ELLIPSIS_SUBST);
        }

        List<String> tokenList = new ArrayList<String>();
        // "true": delimiters are returned as tokens as well.
        StringTokenizer st = new StringTokenizer(text, SPLIT_CHARS, true);
        while (st.hasMoreTokens()) {
            tokenList.add(restorePlaceholders(st.nextToken()));
        }
        return tokenList;
    }

    /**
     * Replaces the dots inside every date matched by {@code DATE_PATTERN} with the date-dot
     * placeholder.
     *
     * <p>Only the matched text itself is rewritten. A group-based replacement such as
     * {@code $1.$2.$3} must not be used here: the pattern has three alternatives with separate
     * groups, and groups of an alternative that did not match expand to the empty string, which
     * would delete the digits of yyyy.mm.dd and yyyy-mm-dd dates.
     */
    private static String protectDates(String text) {
        Matcher matcher = DATE_PATTERN.matcher(text);
        if (!matcher.find()) {
            return text;
        }
        StringBuilder result = new StringBuilder(text.length());
        int copiedUpTo = 0;
        do {
            result.append(text, copiedUpTo, matcher.start());
            // Hyphenated dates contain no dot, so they pass through unchanged.
            result.append(matcher.group().replace('.', DATE_DOT_SUBST));
            copiedUpTo = matcher.end();
        } while (matcher.find());
        result.append(text, copiedUpTo, text.length());
        return result.toString();
    }

    /** Turns every placeholder in {@code token} back into the character(s) it stands for. */
    private static String restorePlaceholders(String token) {
        return token
                .replace(DECIMAL_COMMA_SUBST, ',')
                .replace(DATE_DOT_SUBST, '.')
                .replace(NUMBER_DOT_SUBST, '.')
                .replace(ABBR_DOT_SUBST, '.')
                .replace(COLON_DOT_SUBST, ':')
                .replace(LEFT_BRACE_SUBST, '(')
                .replace(RIGHT_BRACE_SUBST, ')')
                .replace(ELLIPSIS_SUBST, ELLIPSIS);
    }

    /** Normalizes U+2019 and U+02BC to the ASCII apostrophe. */
    private static String cleanup(String text) {
        text = text.replace('\u2019', '\'').replace('\u02bc', '\'');
        return text;
    }
}

