/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.tokenizers.uk;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.tokenizers.Tokenizer;

/**
 * Word tokenizer for Ukrainian text.
 *
 * <p>The tokenizer works in three phases:
 * <ol>
 *   <li>characters that would normally split a token (dots in abbreviations, initials,
 *       dates and numbers, decimal commas, spaces inside grouped numbers, colons between
 *       digits, parentheses inside words, ...) are temporarily replaced with private-use
 *       placeholder characters (U+E001..U+E007, U+E100..U+E103, U+E300);</li>
 *   <li>the text is split with a {@link StringTokenizer} on {@link #SPLIT_CHARS}, with the
 *       delimiters themselves returned as tokens;</li>
 *   <li>the placeholders are turned back into the original characters in every token.</li>
 * </ol>
 *
 * <p>The placeholder U+E110 marks a position where a split is forced without producing a
 * token of its own: it is a delimiter, and tokens equal to it are dropped.
 */
public class UkrainianWordTokenizer implements Tokenizer {
    /** Flags used by most patterns: case-insensitive matching that is also Unicode-aware. */
    private static final int CASE_INSENSITIVE_UNICODE = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    /** Delimiters handed to {@link StringTokenizer}; includes the ellipsis and breaking placeholders. */
    private static final String SPLIT_CHARS = " \u00a0\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u200c\u200d\u200e\u200f\u201a\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f\u205f\u2060\u2061\u2062\u2063\u206a\u206b\u206c\u206d\u206e\u206f\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb,.;()[]{}<>!?:/|\\\"\u00ab\u00bb\u201e\u201d\u201c\u2026\u00bf\u00a1=\t\n\r\ue100\ue101\ue102\ue110";

    // Placeholders standing in for characters that must NOT split the token they appear in.
    private static final char DECIMAL_COMMA_SUBST = '\ue001';
    private static final char NON_BREAKING_SPACE_SUBST = '\ue002';
    private static final char NON_BREAKING_DOT_SUBST = '\ue003';
    private static final char NON_BREAKING_COLON_SUBST = '\ue004';
    private static final char LEFT_BRACE_SUBST = '\ue005';
    private static final char RIGHT_BRACE_SUBST = '\ue006';
    private static final char NON_BREAKING_SLASH_SUBST = '\ue007';

    /** Zero-width split marker: it is a delimiter, and the resulting delimiter token is discarded. */
    private static final String BREAKING_PLACEHOLDER = "\ue110";

    /** A double-quote-like character between a consonant and an iotated vowel, rewritten to an apostrophe. */
    private static final Pattern WEIRD_APOSTROPH_PATTERN = Pattern.compile("([\u0431\u0432\u0434\u0436\u0437\u043a\u043b\u043c\u043d\u043f\u0440\u0441\u0442\u0444\u0445\u0448])[\"\u201d\u201f]([\u0454\u0457\u044e\u044f])", CASE_INSENSITIVE_UNICODE);

    private static final Pattern DECIMAL_COMMA_PATTERN = Pattern.compile("([\\d]),([\\d])", CASE_INSENSITIVE_UNICODE);
    private static final String DECIMAL_COMMA_REPL = "$1\ue001$2";

    /** Numbers written in space-separated groups of three digits, e.g. "1 000 000". */
    private static final Pattern DECIMAL_SPACE_PATTERN = Pattern.compile("(?<=^|[\\s(])\\d{1,3}([ \u00a0\u202f][\\d]{3})+(?=[\\s(]|$)", CASE_INSENSITIVE_UNICODE);

    private static final Pattern DOTTED_NUMBERS_PATTERN = Pattern.compile("([\\d])\\.([\\d])", CASE_INSENSITIVE_UNICODE);
    private static final String DOTTED_NUMBERS_REPL = "$1\ue003$2";

    private static final Pattern COLON_NUMBERS_PATTERN = Pattern.compile("([\\d]):([\\d])", CASE_INSENSITIVE_UNICODE);
    private static final String COLON_NUMBERS_REPL = "$1\ue004$2";

    /** Dates of the form dd.mm.yyyy, yyyy.mm.dd or yyyy-mm-dd. */
    private static final Pattern DATE_PATTERN = Pattern.compile("([\\d]{2})\\.([\\d]{2})\\.([\\d]{4})|([\\d]{4})\\.([\\d]{2})\\.([\\d]{2})|([\\d]{4})-([\\d]{2})-([\\d]{2})", CASE_INSENSITIVE_UNICODE);

    /** A parenthesised letter sequence directly attached to a word, e.g. "word(s)". */
    private static final Pattern BRACE_IN_WORD_PATTERN = Pattern.compile("([\u0430-\u044f\u0456\u0457\u0454\u0491'])\\(([\u0430-\u044f\u0456\u0457\u0454\u0491']+)\\)", CASE_INSENSITIVE_UNICODE);
    private static final String BRACE_IN_WORD_REPL = "$1\ue005$2\ue006";

    // Two-part dotted abbreviations.
    private static final Pattern ABBR_DOT_VO_PATTERN1 = Pattern.compile("(\u0432)\\.([\\s\u00a0\u202f]*\u043e)\\.");
    private static final Pattern ABBR_DOT_VO_PATTERN2 = Pattern.compile("(\u043a)\\.([\\s\u00a0\u202f]*\u0441)\\.");
    private static final Pattern ABBR_DOT_VO_PATTERN3 = Pattern.compile("(\u0447|\u0441\u0442)\\.([\\s\u00a0\u202f]*\u043b)\\.");
    private static final Pattern ABBR_DOT_TYS_PATTERN1 = Pattern.compile("([0-9I\u0406][\\s\u00a0\u202f]+)(\u0442\u0438\u0441|\u0430\u0440\u0442)\\.");
    private static final Pattern ABBR_DOT_TYS_PATTERN2 = Pattern.compile("(\u0442\u0438\u0441|\u0430\u0440\u0442)\\.([\\s\u00a0\u202f]+[\u0430-\u044f\u0456\u0457\u0454\u04910-9])");
    private static final Pattern ABBR_DOT_LAT_PATTERN = Pattern.compile("([^\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490'-]\u043b\u0430\u0442)\\.([\\s\u00a0\u202f]+[a-zA-Z])");
    private static final Pattern ABBR_DOT_PROF_PATTERN = Pattern.compile("([\u0410\u0430]\u043a\u0430\u0434|[\u041f\u043f]\u0440\u043e\u0444|[\u0414\u0434]\u043e\u0446|[\u0410\u0430]\u0441\u0438\u0441\u0442|[\u0410\u0430]\u0440\u0445|\u0432\u0443\u043b|\u043e|\u0440|\u0456\u043c|\u0443\u043f\u043e\u0440\u044f\u0434)\\.([\\s\u00a0\u202f]+[\u0410-\u042f\u0406\u0407\u0404\u0490])");

    // Initials before (SP) or after (RSP) a capitalised word, one or two initials.
    private static final Pattern INITIALS_DOT_PATTERN_SP_2 = Pattern.compile("([\u0410-\u042f\u0406\u0407\u0404\u0490])\\.([\\s\u00a0\u202f]?[\u0410-\u042f\u0406\u0407\u0404\u0490])\\.([\\s\u00a0\u202f]?[\u0410-\u042f\u0406\u0407\u0404\u0490][\u0430-\u044f\u0456\u0457\u0454\u0491']+)");
    private static final Pattern INITIALS_DOT_PATTERN_SP_1 = Pattern.compile("([\u0410-\u042f\u0406\u0407\u0404\u0490])\\.([\\s\u00a0\u202f]?[\u0410-\u042f\u0406\u0407\u0404\u0490][\u0430-\u044f\u0456\u0457\u0454\u0491']+)");
    private static final Pattern INITIALS_DOT_PATTERN_RSP_2 = Pattern.compile("([\u0410-\u042f\u0406\u0407\u0404\u0490][\u0430-\u044f\u0456\u0457\u0454\u0491']+)([\\s\u00a0\u202f]?[\u0410-\u042f\u0406\u0407\u0404\u0490])\\.([\\s\u00a0\u202f]?[\u0410-\u042f\u0406\u0407\u0404\u0490])\\.");
    private static final Pattern INITIALS_DOT_PATTERN_RSP_1 = Pattern.compile("([\u0410-\u042f\u0406\u0407\u0404\u0490][\u0430-\u044f\u0456\u0457\u0454\u0491']+)([\\s\u00a0\u202f]?[\u0410-\u042f\u0406\u0407\u0404\u0490])\\.");
    private static final String INITIALS_DOT_REPL_SP_2 = "$1\ue003\ue110$2\ue003\ue110$3";
    private static final String INITIALS_DOT_REPL_SP_1 = "$1\ue003\ue110$2";
    private static final String INITIALS_DOT_REPL_RSP_2 = "$1\ue110$2\ue003\ue110$3\ue003\ue110";
    private static final String INITIALS_DOT_REPL_RSP_1 = "$1\ue110$2\ue003\ue110";

    private static final Pattern ABBR_DOT_KUB_SM_PATTERN = Pattern.compile("(\u043a\u0432|\u043a\u0443\u0431)\\.([\\s\u00a0\u202f]*(?:[\u0441\u043c\u043a\u0434]|\u043c\u043a)?\u043c)");
    private static final Pattern ABBR_DOT_S_G_PATTERN = Pattern.compile("(\u0441)\\.(-\u0433)\\.");
    private static final Pattern ABBR_DOT_PN_ZAH_PATTERN = Pattern.compile("(\u043f\u043d|\u043f\u0434)\\.(-(\u0437\u0430\u0445|\u0441\u0445))\\.");
    private static final Pattern ABBR_DOT_2_SMALL_LETTERS_PATTERN = Pattern.compile("([^\u0430-\u044f\u0456\u0457\u0454\u0491'-][\u0432\u0435\u043a\u043d\u043f\u0440\u0441\u0442\u0446\u0447]{1,2})\\.([\u0435\u043a\u043c\u043d\u043f\u0440\u0441\u0442\u0447]{1,2})\\.");

    // Replacement templates shared by the abbreviation patterns.
    private static final String ABBR_DOT_2_SMALL_LETTERS_REPL = "$1\ue003\ue110$2\ue003\ue110";
    private static final String ONE_DOT_TWO_REPL = "$1\ue003$2";
    private static final String ONE_DOT_BREAK_TWO_REPL = "$1\ue003\ue110$2";
    private static final String ONE_TWO_DOT_BREAK_REPL = "$1$2\ue003\ue110";
    private static final String ONE_DOT_TWO_DOT_BREAK_REPL = "$1\ue003$2\ue003\ue110";
    private static final String ONE_DOT_BREAK_REPL = "$1\ue003\ue110";
    private static final String ONE_DOT_REPL = "$1\ue003";

    // NOTE(review): the alternative "\u0433\u0440\u0435\u0446(?:\u044c\u043a)" has no trailing '?',
    // unlike "\u0431\u043b(?:\u0438\u0437\u044c\u043a)?" - possibly an omission; left unchanged, confirm intent.
    private static final Pattern ABBR_DOT_NON_ENDING_PATTERN = Pattern.compile("(?<![\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490'-])(\u0430\u0431\u0437|\u0430\u043c\u0435\u0440|\u0430\u043d\u0433\u043b|\u0430\u043a\u0430\u0434(\u0435\u043c)?|\u0430\u0440\u043a|\u0430\u0443\u0434|\u0431\u043b(?:\u0438\u0437\u044c\u043a)?|\u0431\u0443\u0434|\u0432|\u0432\u0438\u043f|\u0432\u0456\u0440\u043c|\u0433\u0440\u0435\u0446(?:\u044c\u043a)|\u0434\u0435\u0440\u0436|\u0434\u0438\u0432|\u0434\u043e\u0434|\u0434\u043e\u043b|\u0434\u043e\u0441\u043b|\u0434\u043e\u0446|\u0434\u043e\u043f|\u0435\u043a\u043e\u043d|\u0435\u043b|\u0436\u0456\u043d|\u0437\u0430\u0432|\u0437\u0430\u0441\u0442|\u0437\u0430\u0445|\u0437\u0431|\u0437\u0432|\u0437\u043e\u0432\u043d|\u0456\u043c|\u0456\u0432\u0440|\u0456\u0441\u043f|\u0456\u0441\u0442|\u0456\u0442\u0430\u043b|\u043a|\u043a\u0430\u0431|\u043a\u0430\u0444|\u043a\u0430\u043d\u0434|\u043a\u0432|[1-9]-\u043a\u0456\u043c\u043d|\u043a\u0456\u043c\u043d|\u043a\u043b|\u043a\u043d|\u043a\u043e\u0435\u0444|\u043c\u0430\u043b|\u043c\u043e\u0431|\u043d|\u043d\u0430\u043f\u0440|\u043d\u0430\u0446|\u043e\u043f|\u043e\u0444|\u043f|\u043f\u0435\u043d|\u043f\u0435\u0440\u0435\u043a\u043b|\u043f\u043b|\u043f\u043e\u043b|\u043f\u043e\u0432|\u043f\u043e\u0440|\u043f\u043e\u0447|\u043f\u043f|\u043f\u0440\u0438\u0431\u043b|\u043f\u0440\u043e\u0432|\u043f\u0440\u043e\u043c|\u043f\u0440\u043e\u0441\u043f|[\u0420\u0440]\u0435\u0434|[\u0420\u0440]\u0435\u0436|\u0440\u043e\u0437\u0434|\u0440\u0442|\u0441|[\u0421\u0441]\u0432\u0432?|\u0441\u043a\u043e\u0440|\u0441\u043e\u0446|\u0441\u043f\u0456\u0432\u0430\u0432\u0442|\u0441\u0442|\u0441\u0442\u043e\u0440|\u0441\u0445|\u0442\u0430\u0431\u043b|[\u0442\u0422]\u0435\u043b|\u0443\u043a\u0440|\u0444\u0456\u043b\u043e\u043b|\u0444\u0440|\u0444\u0440\u0430\u043d\u0446|\u0447|\u0447\u0430\u0439\u043d|\u0446|\u044f\u043f)\\.(?!$)");
    private static final Pattern ABBR_DOT_NON_ENDING_PATTERN_2 = Pattern.compile("([^\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490'-]\u043c)\\.([\\s\u00a0\u202f]*[\u0410-\u042f\u0406\u0407\u0404\u0490])");
    private static final Pattern ABBR_DOT_ENDING_PATTERN = Pattern.compile("([^\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490'-]((\u0442\u0430|\u0439) \u0456\u043d|\u0456\u043d\u0448|\u0430\u0442\u043c|\u0432\u0456\u0434\u0441|\u0433\u0440|\u0435|\u043a\u043e\u043f|\u043e\u0431\u043b|\u0440|\u0440\u0440|\u0440\u0443\u0431|\u0441\u0442|\u0441\u0442\u043e\u043b|\u0441\u0442\u043e\u0440|\u0447\u043e\u043b|\u0448\u0442))\\.");
    private static final Pattern ABBR_DOT_I_T_P_PATTERN = Pattern.compile("([\u0456\u0439][\\s\u00a0\u202f]+\u0442)\\.([\\s\u00a0\u202f]*(\u0434|\u043f|\u0456\u043d))\\.");
    private static final Pattern ABBR_DOT_T_ZV_PATTERN = Pattern.compile("([\\s\u00a0\u202f]+\u0442)\\.([\\s\u00a0\u202f]*(\u0437\u0432))\\.");

    /** A known abbreviation or a single capital letter followed by the final dot of the text. */
    private static final Pattern ABBR_AT_THE_END = Pattern.compile("(?<![\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490])(\u0442\u0438\u0441|[\u0410-\u042f\u0406\u0407\u0404\u0490])\\.$");

    /** A four-digit year (or year range) glued to the "year(s)" abbreviation. */
    private static final Pattern YEAR_WITH_R = Pattern.compile("((?:[12][0-9]{3}[\u2014\u2013-])?[12][0-9]{3})(\u0440\u0440?\\.)");
    private static final String YEAR_WITH_R_REPL = "$1\ue110$2";

    private static final Pattern ABBR_DOT_RED_AVT_PATTERN = Pattern.compile("([\\s\u00a0\u202f]+(?:[\u0420\u0440]\u0435\u0434|[\u0410\u0430]\u0432\u0442))\\.([\\)\\]])");

    // Previously compiled on every call through String.replaceAll; hoisted so they compile once.
    private static final Pattern EM_DASH_BEFORE_SPACE_PATTERN = Pattern.compile("\u2014(\\s)");
    private static final String EM_DASH_BEFORE_SPACE_REPL = "\ue110\u2014$1";
    private static final Pattern ASTERISK_BEFORE_WORD_PATTERN = Pattern.compile("((?:^|[^\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490])\\*+)([\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490])");
    private static final Pattern ASTERISK_AFTER_WORD_PATTERN = Pattern.compile("([\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490])(\\*+(?:[^\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490]|$))");
    private static final String ASTERISK_REPL = "$1\ue110$2";

    // Multi-character sequences collapsed to a single placeholder so they come out as one token.
    private static final String ELLIPSIS = "...";
    private static final String ELLIPSIS_SUBST = "\ue100";
    private static final String ELLIPSIS2 = "!..";
    private static final String ELLIPSIS2_SUBST = "\ue101";
    private static final String ELLIPSIS3 = "?..";
    private static final String ELLIPSIS3_SUBST = "\ue102";
    private static final String SOFT_HYPHEN_WRAP = "\u00ad\n";
    private static final String SOFT_HYPHEN_WRAP_SUBST = "\ue103";

    /** Matches only when the whole text is a single http(s)/ftp URL (anchored at both ends). */
    private static final Pattern URL_PATTERN = Pattern.compile("^(https?|ftp)://[^\\s/$.?#].[^\\s]*$", Pattern.CASE_INSENSITIVE);
    /** Code point (U+E300) of the placeholder that stands in for a URL. */
    private static final int URL_START_REPLACE_CHAR = 0xE300;

    /**
     * Splits {@code text} into tokens. Delimiters (spaces, punctuation, ...) are returned as
     * tokens too, so concatenating the result reproduces the text after {@link #cleanup}
     * normalisation.
     *
     * @param text the text to tokenize; must not be {@code null}
     * @return the list of tokens in order of appearance
     */
    public List<String> tokenize(String text) {
        text = cleanup(text);

        if (text.contains(",")) {
            text = DECIMAL_COMMA_PATTERN.matcher(text).replaceAll(DECIMAL_COMMA_REPL);
        }

        Map<String, String> urls = new HashMap<String, String>();
        // Cheap pre-check ("http", "https" and "ftp" all contain "tp") before running the regex.
        if (text.contains("tp")) {
            Matcher urlMatcher = URL_PATTERN.matcher(text);
            // The pattern is anchored with ^...$, so there is at most one match.
            if (urlMatcher.find()) {
                String placeholder = String.valueOf((char) URL_START_REPLACE_CHAR);
                urls.put(placeholder, urlMatcher.group());
                text = urlMatcher.replaceAll(placeholder);
            }
        }

        // Force a split before an em dash that is followed by whitespace.
        if (text.indexOf('\u2014') != -1) {
            text = EM_DASH_BEFORE_SPACE_PATTERN.matcher(text).replaceAll(EM_DASH_BEFORE_SPACE_REPL);
        }
        // Keep the slash of this two-letter abbreviation inside one token.
        if (text.contains("\u0441/\u0433")) {
            text = text.replace("\u0441/\u0433", "\u0441" + NON_BREAKING_SLASH_SUBST + "\u0433");
        }
        // Split a year from the abbreviation glued to it.
        if (text.contains("\u0440.")) {
            text = YEAR_WITH_R.matcher(text).replaceAll(YEAR_WITH_R_REPL);
        }

        int dotIndex = text.indexOf('.');
        boolean dotInsideSentence = dotIndex >= 0 && dotIndex < text.length() - 1;
        // Dot handling is skipped unless a dot occurs before the last character, or the
        // only dot is the final character and it terminates an abbreviation.
        if (dotInsideSentence
                || dotIndex == text.length() - 1 && ABBR_AT_THE_END.matcher(text).find()) {
            text = protectDots(text);
        }

        if (text.contains("*")) {
            text = ASTERISK_BEFORE_WORD_PATTERN.matcher(text).replaceAll(ASTERISK_REPL);
            text = ASTERISK_AFTER_WORD_PATTERN.matcher(text).replaceAll(ASTERISK_REPL);
        }

        text = ABBR_DOT_ENDING_PATTERN.matcher(text).replaceAll(ONE_DOT_REPL);
        text = protectSpacedNumbers(text);

        if (text.contains(":")) {
            text = COLON_NUMBERS_PATTERN.matcher(text).replaceAll(COLON_NUMBERS_REPL);
        }
        if (text.contains("(")) {
            text = BRACE_IN_WORD_PATTERN.matcher(text).replaceAll(BRACE_IN_WORD_REPL);
        }
        if (text.contains(SOFT_HYPHEN_WRAP)) {
            text = text.replace(SOFT_HYPHEN_WRAP, SOFT_HYPHEN_WRAP_SUBST);
        }

        List<String> tokenList = new ArrayList<String>();
        StringTokenizer splitter = new StringTokenizer(text, SPLIT_CHARS, true);
        while (splitter.hasMoreTokens()) {
            String token = splitter.nextToken();
            if (token.equals(BREAKING_PLACEHOLDER)) {
                continue; // split marker only, never a real token
            }
            tokenList.add(restoreToken(token, dotInsideSentence, urls));
        }
        return tokenList;
    }

    /**
     * Replaces ellipses and every dot that belongs to a date, number, abbreviation or
     * initial with placeholders, so that those dots do not split their token.
     * The order of the replacements is significant and must be kept.
     */
    private static String protectDots(String text) {
        if (text.contains(ELLIPSIS)) {
            text = text.replace(ELLIPSIS, ELLIPSIS_SUBST);
        }
        if (text.contains(ELLIPSIS2)) {
            text = text.replace(ELLIPSIS2, ELLIPSIS2_SUBST);
        }
        if (text.contains(ELLIPSIS3)) {
            text = text.replace(ELLIPSIS3, ELLIPSIS3_SUBST);
        }
        text = protectDates(text);
        text = DOTTED_NUMBERS_PATTERN.matcher(text).replaceAll(DOTTED_NUMBERS_REPL);
        text = ABBR_DOT_2_SMALL_LETTERS_PATTERN.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
        text = ABBR_DOT_VO_PATTERN1.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
        text = ABBR_DOT_VO_PATTERN2.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
        text = ABBR_DOT_VO_PATTERN3.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
        text = ABBR_DOT_TYS_PATTERN1.matcher(text).replaceAll(ONE_TWO_DOT_BREAK_REPL);
        text = ABBR_DOT_TYS_PATTERN2.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
        text = ABBR_DOT_LAT_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
        text = ABBR_DOT_PROF_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
        text = INITIALS_DOT_PATTERN_SP_2.matcher(text).replaceAll(INITIALS_DOT_REPL_SP_2);
        text = INITIALS_DOT_PATTERN_SP_1.matcher(text).replaceAll(INITIALS_DOT_REPL_SP_1);
        text = INITIALS_DOT_PATTERN_RSP_2.matcher(text).replaceAll(INITIALS_DOT_REPL_RSP_2);
        text = INITIALS_DOT_PATTERN_RSP_1.matcher(text).replaceAll(INITIALS_DOT_REPL_RSP_1);
        text = ABBR_DOT_KUB_SM_PATTERN.matcher(text).replaceAll(ONE_DOT_BREAK_TWO_REPL);
        text = ABBR_DOT_S_G_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_DOT_BREAK_REPL);
        text = ABBR_DOT_PN_ZAH_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_DOT_BREAK_REPL);
        text = ABBR_DOT_I_T_P_PATTERN.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
        text = ABBR_DOT_T_ZV_PATTERN.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
        text = ABBR_DOT_RED_AVT_PATTERN.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
        text = ABBR_DOT_NON_ENDING_PATTERN.matcher(text).replaceAll(ONE_DOT_BREAK_REPL);
        text = ABBR_DOT_NON_ENDING_PATTERN_2.matcher(text).replaceAll(ONE_DOT_TWO_REPL);
        return text;
    }

    /**
     * Makes the dots inside every date matched by {@link #DATE_PATTERN} non-breaking.
     *
     * <p>The replacement is built from the matched text itself rather than from numbered
     * groups: the pattern has three alternatives with separate groups (1-3, 4-6, 7-9), and a
     * fixed "$1.$2.$3" template erased the digits of dates matched by the second or third
     * alternative, because groups that did not participate expand to nothing.
     */
    private static String protectDates(String text) {
        Matcher dateMatcher = DATE_PATTERN.matcher(text);
        if (!dateMatcher.find()) {
            return text;
        }
        StringBuffer sb = new StringBuffer(text.length());
        do {
            String date = dateMatcher.group().replace('.', NON_BREAKING_DOT_SUBST);
            dateMatcher.appendReplacement(sb, Matcher.quoteReplacement(date));
        } while (dateMatcher.find());
        dateMatcher.appendTail(sb);
        return sb.toString();
    }

    /**
     * Replaces the group separators (space, no-break space, narrow no-break space) inside
     * numbers matched by {@link #DECIMAL_SPACE_PATTERN} with a placeholder, so that such a
     * number is kept as a single token.
     */
    private static String protectSpacedNumbers(String text) {
        Matcher numberMatcher = DECIMAL_SPACE_PATTERN.matcher(text);
        if (!numberMatcher.find()) {
            return text;
        }
        StringBuffer sb = new StringBuffer();
        do {
            // The match consists of digits and separators only, so it is safe as a replacement string.
            String joined = numberMatcher.group(0)
                    .replace(' ', NON_BREAKING_SPACE_SUBST)
                    .replace('\u00a0', NON_BREAKING_SPACE_SUBST)
                    .replace('\u202f', NON_BREAKING_SPACE_SUBST);
            numberMatcher.appendReplacement(sb, joined);
        } while (numberMatcher.find());
        numberMatcher.appendTail(sb);
        return sb.toString();
    }

    /**
     * Turns the placeholders inside a single token back into the characters they stand for.
     *
     * @param token the raw token produced by the splitter
     * @param restoreEllipses whether ellipsis placeholders are expanded; the caller passes
     *     {@code true} only when the text had a dot before its last character
     * @param urls placeholder-to-URL mapping collected before splitting
     * @return the token as it should be returned to the caller
     */
    private static String restoreToken(String token, boolean restoreEllipses, Map<String, String> urls) {
        // Note: every grouping separator comes back as a plain space.
        token = token.replace(DECIMAL_COMMA_SUBST, ',')
                .replace(NON_BREAKING_SLASH_SUBST, '/')
                .replace(NON_BREAKING_COLON_SUBST, ':')
                .replace(NON_BREAKING_SPACE_SUBST, ' ')
                .replace(LEFT_BRACE_SUBST, '(')
                .replace(RIGHT_BRACE_SUBST, ')')
                .replace(NON_BREAKING_DOT_SUBST, '.');
        if (restoreEllipses) {
            token = token.replace(ELLIPSIS_SUBST, ELLIPSIS);
            token = token.replace(ELLIPSIS2_SUBST, ELLIPSIS2);
            token = token.replace(ELLIPSIS3_SUBST, ELLIPSIS3);
        }
        token = token.replace(SOFT_HYPHEN_WRAP_SUBST, SOFT_HYPHEN_WRAP);
        for (Map.Entry<String, String> url : urls.entrySet()) {
            token = token.replace(url.getKey(), url.getValue());
        }
        return token;
    }

    /**
     * Normalises apostrophe look-alikes to the ASCII apostrophe, the low-9 quotation mark to
     * a comma and the non-breaking hyphen to a plain hyphen, then rewrites a double-quote-like
     * character standing between a consonant and an iotated vowel to an apostrophe.
     */
    private static String cleanup(String text) {
        text = text.replace('\u2019', '\'')
                .replace('\u02bc', '\'')
                .replace('\u2018', '\'')
                .replace('`', '\'')
                .replace('\u00b4', '\'')
                .replace('\u201a', ',')
                .replace('\u2011', '-');
        return WEIRD_APOSTROPH_PATTERN.matcher(text).replaceAll("$1'$2");
    }
}

