/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.tokenizers.uk;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.tokenizers.Tokenizer;

public class UkrainianWordTokenizer
implements Tokenizer {
    /**
     * Regex source for everything treated as a delimiter when the text is cut into tokens:
     * runs of "!" / "?" / ".", single whitespace and punctuation characters, "%" unless it is
     * followed by a hyphen or en dash plus a Cyrillic letter, quote characters, superscript
     * digits directly after a Cyrillic letter, several symbol ranges (including code points
     * U+1F000..U+1FFFF, built with Character.toChars) and the breaking placeholder U+E110.
     * <p>
     * Two guards suppress a split: a quote preceded by U+E109 is not a delimiter, and no
     * delimiter is recognised when it is immediately followed by U+E120.
     */
    private static final String SPLIT_CHARS = "(!{2,3}|\\?{2,3}|\\.{3}|[!?][!?.]{1,2}|[ \u00a0\\n\\r\\t,.;!?\u2014:()\\[\\]{}<>/|\\\\\u2026\u00b0$\u20ac\u20b4=\u00bf\u00a1]|%(?![-\u2013][\u0430-\u044f\u0456\u0457\u0454\u0491])|(?<!\ue109)[\"\u00ab\u00bb\u201e\u201d\u201c]|(?<=[\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490])[\u00b9\u00b2\u2070-\u2079]|[\u2000-\u200f\u201a\u2020-\u202f\u2030\u2031\u2033-\u206f\u2400-\u27ff" + String.valueOf(Character.toChars(126976)) + "-" + String.valueOf(Character.toChars(131071)) + "\uf000-\uffff\ue110])(?!\ue120)";
    /** Compiled form of SPLIT_CHARS; tokenize() passes it to splitWithDelimiters(). */
    private static final Pattern SPLIT_CHARS_REGEX = Pattern.compile(SPLIT_CHARS);
    // Private-use stand-ins for characters that must not act as delimiters inside a token.
    // tokenize() maps each of them back to the real character after splitting:
    // U+E001 -> ',', U+E002 -> ' ', U+E003 -> '.', U+E004 -> ':', U+E005 -> '(',
    // U+E006 -> ')', U+E007 -> '/', U+E008 -> '<', U+E009 -> '>', U+E010 -> '/'.
    private static final char DECIMAL_COMMA_SUBST = '\ue001';
    private static final char NON_BREAKING_SPACE_SUBST = '\ue002';
    private static final char NON_BREAKING_DOT_SUBST = '\ue003';
    private static final char NON_BREAKING_COLON_SUBST = '\ue004';
    private static final char LEFT_BRACE_SUBST = '\ue005';
    private static final char RIGHT_BRACE_SUBST = '\ue006';
    private static final char NON_BREAKING_SLASH_SUBST = '\ue007';
    private static final char LEFT_ANGLE_SUBST = '\ue008';
    private static final char RIGHT_ANGLE_SUBST = '\ue009';
    private static final char SLASH_SUBST = '\ue010';
    // Marker that stops a following quote character from being split off (see the lookbehind
    // in SPLIT_CHARS); removed from every token in tokenize().
    private static final String NON_BREAKING_PLACEHOLDER = "\ue109";
    // Marker that forces a token boundary: it is itself a delimiter in SPLIT_CHARS and the
    // resulting stand-alone token is dropped in tokenize().
    private static final String BREAKING_PLACEHOLDER = "\ue110";
    // Marker placed right after a delimiter character to keep it inside the current token (see
    // the trailing negative lookahead in SPLIT_CHARS); removed from every token in tokenize().
    private static final String NON_BREAKING_PLACEHOLDER2 = "\ue120";
    // Flag value 66 used below is Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE (2 | 64);
    // flag value 2 is Pattern.CASE_INSENSITIVE alone.

    // A quote-like character standing between one of the listed consonants and one of the
    // listed vowels; cleanup() follows it (and the consonant) with NON_BREAKING_PLACEHOLDER2.
    private static final Pattern WEIRD_APOSTROPH_PATTERN = Pattern.compile("([\u0431\u0432\u0434\u0436\u0437\u043a\u043b\u043c\u043d\u043f\u0440\u0441\u0442\u0444\u0445\u0448])([\"\u201d\u201f`\u00b4])([\u0454\u0457\u044e\u044f])", 66);
    // A Cyrillic letter followed by a bracketed run of Cyrillic letters, e.g. letter + "[letters]".
    public static final Pattern WORDS_WITH_BRACKETS_PATTERN = Pattern.compile("([\u0430-\u044f\u0456\u0457\u0454\u0491])\\[([\u0430-\u044f\u0456\u0457\u0454\u0491]+)\\]", 66);
    // Comma between two digits; replaced by DECIMAL_COMMA_SUBST so the number stays one token.
    private static final Pattern DECIMAL_COMMA_PATTERN = Pattern.compile("([\\d]),([\\d])", 66);
    private static final String DECIMAL_COMMA_REPL = "$1\ue001$2";
    // Number written as groups of three digits separated by horizontal whitespace, delimited
    // by whitespace, "(" or the text boundaries.
    private static final Pattern DECIMAL_SPACE_PATTERN = Pattern.compile("(?<=^|[\\h\\v(])\\d{1,3}([\\h][\\d]{3})+(?=[\\h\\v(]|$)", 66);
    // Two runs of the characters I, V, X (Latin) or the two listed Cyrillic look-alikes, joined
    // by a hyphen or en dash; the replacement isolates the dash with breaking placeholders.
    private static final Pattern DASH_NUMBERS_PATTERN = Pattern.compile("([IVX\u0406\u0425]+)([\u2013-])([IVX\u0406\u0425]+)");
    private static final String DASH_NUMBERS_REPL = "$1\ue110$2\ue110$3";
    // En dash glued to the preceding letter/digit and followed by horizontal whitespace, unless
    // one of four listed short words plus whitespace follows.
    private static final Pattern N_DASH_SPACE_PATTERN = Pattern.compile("([\u0430-\u044f\u0456\u0457\u0454\u0491a-z0-9])(\u2013\\h)(?!(\u0442\u0430|\u0447\u0438|\u0456|\u0439)[\\h\\v])", 66);
    // En dash preceded by whitespace or punctuation and glued to the following letter.
    private static final Pattern N_DASH_SPACE_PATTERN2 = Pattern.compile("([\\h.,;!?]\u2013)([\u0430-\u044f\u0456\u0457\u0454\u0491a-z])", 66);
    // Generic "insert a forced break between group 1 and group 2" replacement; reused by many
    // two-group patterns in this class.
    private static final String N_DASH_SPACE_REPL = "$1\ue110$2";
    // Dot between digits (two-part and three-part forms).
    private static final Pattern DOTTED_NUMBERS_PATTERN = Pattern.compile("([\\d])\\.([\\d])");
    private static final Pattern DOTTED_NUMBERS_PATTERN3 = Pattern.compile("([\\d])\\.([\\d]+)\\.([\\d])");
    // Colon between digits; replaced by NON_BREAKING_COLON_SUBST.
    private static final Pattern COLON_NUMBERS_PATTERN = Pattern.compile("([\\d]):([\\d])");
    private static final String COLON_NUMBERS_REPL = "$1\ue004$2";
    // Dates in three layouts: dd.MM.yyyy (groups 1-3), yyyy.MM.dd (groups 4-6) and
    // yyyy-MM-dd (groups 7-9).
    private static final Pattern DATE_PATTERN = Pattern.compile("([\\d]{2})\\.([\\d]{2})\\.([\\d]{4})|([\\d]{4})\\.([\\d]{2})\\.([\\d]{2})|([\\d]{4})-([\\d]{2})-([\\d]{2})", 66);
    // NOTE(review): this replacement references only groups 1-3, so it reproduces the digits
    // only for the first alternative of DATE_PATTERN; groups 4-9 are never referenced.
    private static final String DATE_PATTERN_REPL = "$1\ue003$2\ue003$3";
    // Cyrillic letter followed by a parenthesised run of Cyrillic letters/apostrophes.
    private static final Pattern BRACE_IN_WORD_PATTERN = Pattern.compile("([\u0430-\u044f\u0456\u0457\u0454\u0491])\\(([\u0430-\u044f\u0456\u0457\u0454\u0491']+)\\)", 66);
    // Simple opening, closing or self-closing tag made of ASCII letters and underscores.
    private static final Pattern XML_TAG_PATTERN = Pattern.compile("<(/?[a-z_]+/?)>", 2);
    // Initials with dots before a capitalised word (SP_2: two initials, SP_1: one) and after a
    // capitalised word (RSP_2 / RSP_1).
    private static final Pattern INITIALS_DOT_PATTERN_SP_2 = Pattern.compile("([\u0410-\u042f\u0406\u0407\u0404\u0490])\\.([\\h\\v]{0,5}[\u0410-\u042f\u0406\u0407\u0404\u0490])\\.([\\h\\v]{0,5}[\u0410-\u042f\u0406\u0407\u0404\u0490][\u0430-\u044f\u0456\u0457\u0454\u0491']+)");
    private static final Pattern INITIALS_DOT_PATTERN_SP_1 = Pattern.compile("([\u0410-\u042f\u0406\u0407\u0404\u0490])\\.([\\h\\v]{0,5}[\u0410-\u042f\u0406\u0407\u0404\u0490][\u0430-\u044f\u0456\u0457\u0454\u0491']+)");
    private static final Pattern INITIALS_DOT_PATTERN_RSP_2 = Pattern.compile("([\u0410-\u042f\u0406\u0407\u0404\u0490][\u0430-\u044f\u0456\u0457\u0454\u0491']+)([\\h\\v]?[\u0410-\u042f\u0406\u0407\u0404\u0490])\\.([\\h\\v]?[\u0410-\u042f\u0406\u0407\u0404\u0490])\\.");
    private static final Pattern INITIALS_DOT_PATTERN_RSP_1 = Pattern.compile("([\u0410-\u042f\u0406\u0407\u0404\u0490][\u0430-\u044f\u0456\u0457\u0454\u0491']+)([\\h\\v]?[\u0410-\u042f\u0406\u0407\u0404\u0490])\\.");
    // Replacements for the initials patterns: each dot becomes NON_BREAKING_DOT_SUBST and is
    // followed by a forced break.
    private static final String INITIALS_DOT_REPL_SP_2 = "$1\ue003\ue110$2\ue003\ue110$3";
    private static final String INITIALS_DOT_REPL_SP_1 = "$1\ue003\ue110$2";
    private static final String INITIALS_DOT_REPL_RSP_2 = "$1\ue110$2\ue003\ue110$3\ue003\ue110";
    private static final String INITIALS_DOT_REPL_RSP_1 = "$1\ue110$2\ue003\ue110";
    // Dotted abbreviations. Each pattern captures the text around the dot(s) so that
    // adjustTextForTokenizing() can keep the dot attached to the abbreviation.
    private static final Pattern ABBR_DOT_VO_PATTERN1 = Pattern.compile("([\u0432\u0412\u0443])\\.([\\h\\v]*\u043e)\\.");
    private static final Pattern ABBR_DOT_VO_PATTERN2 = Pattern.compile("(\u043a)\\.([\\h\\v]*\u0441)\\.");
    private static final Pattern ABBR_DOT_VO_PATTERN3 = Pattern.compile("(\u0447|\u0441\u0442)\\.([\\h\\v]*\u043b)\\.");
    private static final Pattern ABBR_DOT_TYS_PATTERN1 = Pattern.compile("([0-9I\u0406][\\h\\v]+)(\u0442\u0438\u0441|\u0430\u0440\u0442)\\.");
    private static final Pattern ABBR_DOT_TYS_PATTERN2 = Pattern.compile("(\u0442\u0438\u0441|\u0430\u0440\u0442)\\.([\\h\\v]+[\u0430-\u044f\u0456\u0457\u0454\u04910-9])");
    private static final Pattern ABBR_DOT_ART_PATTERN = Pattern.compile("([\u0410\u0430]\u0440\u0442|[\u041c\u043c]\u0430\u043b|[\u0420\u0440]\u0438\u0441)\\.([\\h]*[0-9])");
    private static final Pattern ABBR_DOT_MAN_PATTERN = Pattern.compile("(\u041c\u0430\u043d)\\.([\\h]*(\u0421\u0456\u0442\u0456|[\u042e\u044e]\u043d))");
    private static final Pattern ABBR_DOT_LAT_PATTERN = Pattern.compile("([^\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490'\u0301-]\u043b\u0430\u0442)\\.([\\h\\v]+[a-zA-Z])");
    private static final Pattern ABBR_DOT_PROF_PATTERN = Pattern.compile("(?<![\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490'\u0301-])([\u0410\u0430]\u043a\u0430\u0434|[\u041f\u043f]\u0440\u043e\u0444|[\u0414\u0434]\u043e\u0446|[\u0410\u0430]\u0441\u0438\u0441\u0442|[\u0410\u0430]\u0440\u0445|\u0442\u043e\u0432|\u0432\u0443\u043b|\u043e|\u0440|\u0456\u043c|\u0443\u043f\u043e\u0440\u044f\u0434|[\u041f\u043f]\u0440\u0435\u043f|\u0406\u0432|\u0414\u0436)\\.([\\h\\v]+[\u0410-\u042f\u0406\u0407\u0404\u0490\u0430-\u044f\u0456\u0457\u0454\u0491])");
    private static final Pattern ABBR_DOT_GUB_PATTERN = Pattern.compile("(.[\u0410-\u042f\u0406\u0407\u0404\u0490][\u0430-\u044f\u0456\u0457\u0454\u0491'-]+[\\h\\v]+\u0433\u0443\u0431)\\.");
    private static final Pattern ABBR_DOT_DASH_PATTERN = Pattern.compile("\\b([\u0410-\u042f\u0406\u0407\u0404\u0490]\u0436?)\\.([-\u2013]([\u0410-\u042f\u0406\u0407\u0404\u0490][\u0430-\u044f\u0456\u0457\u0454\u0491']{2}|[\u0410-\u042f\u0406\u0407\u0404\u0490]\\.))");
    private static final Pattern ABBR_DOT_KUB_SM_PATTERN = Pattern.compile("(\u043a\u0432|\u043a\u0443\u0431)\\.([\\h\\v]*(?:[\u0441\u043c\u043a\u0434]|\u043c\u043a)?\u043c)");
    private static final Pattern ABBR_DOT_S_G_PATTERN = Pattern.compile("(\u0441)\\.(-\u0433)\\.");
    private static final Pattern ABBR_DOT_PN_ZAH_PATTERN = Pattern.compile("(\u043f\u043d|\u043f\u0434)\\.(-(\u0437\u0430\u0445|\u0441\u0445))\\.");
    private static final Pattern INVALID_MLN_DOT_PATTERN = Pattern.compile("(\u043c\u043b\u043d|\u043c\u043b\u0440\u0434)\\.( [\u0430-\u044f\u0456\u0457\u0454\u0491])");
    private static final Pattern ABBR_DOT_2_SMALL_LETTERS_PATTERN = Pattern.compile("([^\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490'\u0301-][\u0432\u0435\u043a\u043d\u043f\u0440\u0441\u0442\u0446\u0447]{1,2})\\.(\\h*(?![\u0441\u043c\u043a\u0434]?\u043c\\.)[\u0435\u043a\u043c\u043d\u043f\u0440\u0441\u0442\u0447]{1,2})\\.");
    // Replacements for two-part / one-part dotted abbreviations: dot becomes
    // NON_BREAKING_DOT_SUBST and a forced break follows it.
    private static final String ABBR_DOT_2_SMALL_LETTERS_REPL = "$1\ue003\ue110$2\ue003\ue110";
    private static final String ONE_DOT_TWO_REPL = "$1\ue003\ue110$2";
    private static final Pattern ABBR_DOT_NON_ENDING_PATTERN = Pattern.compile("(?<![\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490'\u0301-])(\u0430\u0431\u0437|\u0430\u0432\u0441\u0442\u0440\u0430\u043b|\u0430\u043c\u0435\u0440|\u0430\u043d\u0433\u043b|\u0430\u043a\u0430\u0434(\u0435\u043c)?|\u0430\u0440\u043a|\u0430\u0443\u0434|\u0431\u043b(?:\u0438\u0437\u044c\u043a)?|\u0431\u0443\u0434|\u0432(?!\\.+)|\u0432\u0438\u043f|\u0432\u0456\u0440\u043c|\u0433\u0440\u0435\u0446(?:\u044c\u043a)|\u0434\u0435\u0440\u0436|\u0434\u0438\u0432|\u0434\u0456\u0430\u043b|\u0434\u043e\u0434|\u0434\u043e\u043b|\u0434\u043e\u0441\u043b|\u0434\u043e\u0446|\u0434\u043e\u043f|\u0435\u043a\u043e\u043d|\u0435\u043b|\u0436\u0456\u043d|\u0437\u0430\u0432|\u0437\u0430\u0441\u0442|\u0437\u0430\u0445|\u0437\u0431|\u0437\u0432|\u0437\u043d\u0435\u0432\u0430\u0436\u043b?|\u0437\u043e\u0432\u043d|\u0456\u043c|\u0456\u0432\u0440|\u0456\u0441\u043f|\u0456\u0441\u0442|\u0456\u0442\u0430\u043b|\u043a|\u043a\u0430\u0431|\u043a\u0430\u0444|\u043a\u0430\u043d\u0434|\u043a\u0432|[1-9]-\u043a\u0456\u043c\u043d|\u043a\u0456\u043c\u043d|\u043a\u043b|\u043a\u043d|\u043a\u043e\u0435\u0444|\u043b\u0430\u0442\u0438\u043d|\u043c\u0430\u043b|\u043c\u043e\u0431|\u043d|[\u041d\u043d]\u0430\u043f\u0440|\u043d\u0430\u0446|\u043e\u0431\u0440\u0430\u0437\u043d|\u043e\u043f|\u043e\u0444|\u043f|\u043f\u0435\u043d|\u043f\u0435\u0440\u0435\u043a\u043b|\u043f\u0435\u0440\u0435\u043d|\u043f\u043b|\u043f\u043e\u043b|\u043f\u043e\u0432|\u043f\u043e\u0440|\u043f\u043e\u0447|\u043f\u043f|\u043f\u0440\u0438\u0431\u043b|\u043f\u0440\u0438\u043a\u043c|\u043f\u0440\u0438\u043c|\u043f\u0440\u0438\u0441\u043b|\u043f\u0440\u043e\u0432|\u043f\u0440\u043e\u043c|\u043f\u0440\u043e\u0441\u043f|[\u0420\u0440]\u0435\u0434|[\u0420\u0440]\u0435\u0436|\u0440\u043e\u0437\u0434|\u0440\u043e\u0437\u043c|\u0440\u0442|\u0440\u0443\u043c|\u0441|[\u0421\u0441]\u0432\u0432?|\u0441\u043a\u043e\u0440|\u0441\u043e\u0446|\u0441\u
043f\u0456\u0432\u0430\u0432\u0442|[\u0441\u0421]\u0442|\u0441\u0442\u043e\u0440|\u0441\u0445|\u0442\u0430\u0431\u043b|\u0442\u0442|[\u0442\u0422]\u0435\u043b|\u0442\u0435\u0445\u043d|\u0443\u043a\u0440|\u0444\u0456\u043b\u043e\u043b|\u0444\u0440|\u0444\u0440\u0430\u043d\u0446|\u0445\u0443\u0434|\u0447|\u0447\u0430\u0439\u043d|\u0447\u0430\u0441\u0442|\u0446|\u044f\u043f)\\.(?!\ue120|\\.+[\\h\\v]*$)");
    // A stand-alone single-letter abbreviation with a dot, followed (after optional whitespace)
    // by a capital Cyrillic letter.
    private static final Pattern ABBR_DOT_NON_ENDING_PATTERN_2 = Pattern.compile("([^\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490'-]\u043c\\.)([\\h\\v]*[\u0410-\u042f\u0406\u0407\u0404\u0490])");
    // Abbreviations with a trailing dot that is not already followed by U+E120; applied
    // unconditionally in adjustTextForTokenizing().
    private static final Pattern ABBR_DOT_ENDING_PATTERN = Pattern.compile("([^\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490'\u0301-]((\u0442\u0430|\u0439|\u0456) (\u0456\u043d\u0448?|\u043f\u043e\u0434)|\u0430\u0442\u043c|\u0432\u0456\u0434\u0441|\u0433\u0440|\u043a\u043e\u043f|\u043e\u0431\u043b|\u0440|\u0440\u0440|\u0420\u0420|\u0440\u0443\u0431|\u0441\u0442|\u0441\u0442\u043e\u043b|\u0441\u0442\u043e\u0440|\u0447\u043e\u043b|\u0448\u0442))\\.(?!\ue120)");
    // Multi-part dotted abbreviations; group 1 and group 2 each end with their own dot.
    private static final Pattern ABBR_DOT_I_T_P_PATTERN = Pattern.compile("([\u0456\u0439][\\h\\v]+\u0442\\.)([\\h\\v]*(\u0434|\u043f|\u0456\u043d)\\.)");
    private static final Pattern ABBR_DOT_I_T_CH_PATTERN = Pattern.compile("([\u0432\u0443][\\h\\v]+\u0442\\.)([\\h\\v]*\u0447\\.)");
    private static final Pattern ABBR_DOT_T_ZV_PATTERN = Pattern.compile("([\\h\\v]+\u0442\\.)([\\h\\v]*\u0437\u0432\\.)");
    // One of two listed abbreviations or a single capital letter, with a dot, at the very end
    // of the text (trailing whitespace allowed). Used to decide whether a text whose only dot
    // is the final one still needs the abbreviation handling.
    private static final Pattern ABBR_AT_THE_END = Pattern.compile("(?<![\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490'\u0301])(\u0442\u0438\u0441|\u0433\u0443\u0431|[\u0410-\u042f\u0406\u0407\u0404\u0490])\\.[\\h\\v]*$");
    // Apostrophe directly before a letter at the start of a word (with one listed exception).
    private static final Pattern APOSTROPHE_BEGIN_PATTERN = Pattern.compile("(^|[\\h\\v(\u201e\u00ab\"'])'(?!\u0434\u043d\u043e)(\\p{L})");
    // Apostrophe directly after a letter and before a non-letter, non-hyphen character or the
    // end of the text, unless the preceding word is one of the listed exceptions.
    private static final Pattern APOSTROPHE_END_PATTER = Pattern.compile("(\\p{L})(?<!\\b(?:\u043c\u043e|\u0442\u0440\u0435|\u0442\u0440\u0430|\u0447\u043e|\u043d\u0456\u0447\u043e|\u0431\u043e|\u0437\u0430\u0440\u0430|\u043f\u0440\u0430))'([^\\p{L}-]|$)", 66);
    // Four-digit year (or dash-joined year range) glued to a one- or two-letter dotted suffix.
    private static final Pattern YEAR_WITH_R = Pattern.compile("((?:[12][0-9]{3}[\u2014\u2013-])?[12][0-9]{3})(\u0440\u0440?\\.)");
    // Hyphenated compounds where one part is wrapped in quotes (quoted part second / first).
    private static final Pattern COMPOUND_WITH_QUOTES1 = Pattern.compile("([\u0430-\u044f\u0456\u0457\u0454]-)([\u00ab\"\u201e])([\u0430-\u044f\u0456\u0457\u0454\u0491'-]+)([\u00bb\"\u201c])", 66);
    private static final Pattern COMPOUND_WITH_QUOTES2 = Pattern.compile("([\u00ab\"\u201e])([\u0430-\u044f\u0456\u0457\u0454\u04910-9'-]+)([\u00bb\\\"\u201c])(-[\u0430-\u044f\u0456\u0457\u0454])", 66);
    // One of two listed abbreviations with a dot, directly before ")" or "]".
    private static final Pattern ABBR_DOT_RED_AVT_PATTERN = Pattern.compile("([\\h\\v]+(?:[\u0420\u0440]\u0435\u0434|[\u0410\u0430]\u0432\u0442))\\.([\\)\\]])");
    // Soft hyphen followed by a line feed, and the single private-use character that stands in
    // for it while splitting; tokenize() converts the stand-in back.
    private static final String SOFT_HYPHEN_WRAP = "\u00ad\n";
    private static final String SOFT_HYPHEN_WRAP_SUBST = "\ue103";
    // http/https/ftp/www URLs and e-mail addresses (optionally prefixed with "mailto:").
    private static final Pattern URL_PATTERN = Pattern.compile("((https?|ftp)://|www\\.)[^\\h\\v/$.?#),]+\\.[^\\h\\v),\">]*|(mailto:)?[\\p{L}\\d._-]+@[\\p{L}\\d_-]+(\\.[\\p{L}\\d_-]+)+", 2);
    // First private-use code point (U+E300) handed out as a URL stand-in; each further URL in
    // the same text gets the next code point.
    private static final int URL_START_REPLACE_CHAR = 58112;
    // Em/en dash (or, in the second pattern, a hyphen before a capital) glued to the first
    // letter of the text.
    private static final Pattern LEADING_DASH_PATTERN = Pattern.compile("^([\u2014\u2013])([\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490A-Z])");
    private static final Pattern LEADING_DASH_PATTERN_2 = Pattern.compile("^(-)([\u0410-\u042f\u0406\u0407\u0404\u0490A-Z])");
    // A Cyrillic word immediately followed by digits, where the digits are not followed by a
    // letter or closing quote; the replacement forces a break between word and number.
    private static final Pattern NUMBER_MISSING_SPACE = Pattern.compile("((?:[\\h\\v\ue110]|^)[\u0430-\u044f\u0457\u0456\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490'-]*[\u0430-\u044f\u0457\u0456\u0454\u0491']?[\u0430-\u044f\u0457\u0456\u0454\u0491])([0-9]+(?![\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490a-zA-Z\u00bb\"\u201c]))");

    /**
     * Splits {@code text} into tokens, delimiters included.
     * <p>
     * Unless the text is blank, it is first rewritten by {@code adjustTextForTokenizing} so
     * that private-use placeholder characters mark where delimiters must not (or must) cause a
     * split. The text is then cut with {@code SPLIT_CHARS_REGEX}; tokens consisting solely of
     * the breaking placeholder are discarded and, in every other token, the placeholders are
     * turned back into the characters (or URLs) they stand for.
     *
     * @param text the text to tokenize; must not be {@code null}
     * @return the tokens in order of appearance
     */
    public List<String> tokenize(String text) {
        HashMap<String, String> urlByPlaceholder = new HashMap<String, String>();
        String prepared = text;
        if (!text.trim().isEmpty()) {
            prepared = this.adjustTextForTokenizing(text, urlByPlaceholder);
        }

        ArrayList<String> result = new ArrayList<String>();
        for (String piece : UkrainianWordTokenizer.splitWithDelimiters(prepared, SPLIT_CHARS_REGEX)) {
            // A bare breaking placeholder only marks a boundary; it is not a token itself.
            if (BREAKING_PLACEHOLDER.equals(piece)) {
                continue;
            }
            // Undo the single-character substitutions, then drop the non-breaking markers.
            String restored = piece
                    .replace(DECIMAL_COMMA_SUBST, ',')
                    .replace(NON_BREAKING_SLASH_SUBST, '/')
                    .replace(NON_BREAKING_COLON_SUBST, ':')
                    .replace(NON_BREAKING_SPACE_SUBST, ' ')
                    .replace(LEFT_BRACE_SUBST, '(')
                    .replace(RIGHT_BRACE_SUBST, ')')
                    .replace(LEFT_ANGLE_SUBST, '<')
                    .replace(RIGHT_ANGLE_SUBST, '>')
                    .replace(SLASH_SUBST, '/')
                    .replace(NON_BREAKING_DOT_SUBST, '.')
                    .replace(SOFT_HYPHEN_WRAP_SUBST, SOFT_HYPHEN_WRAP)
                    .replace(NON_BREAKING_PLACEHOLDER, "")
                    .replace(NON_BREAKING_PLACEHOLDER2, "");
            // Put back every URL / e-mail address that was swapped for a one-character stand-in.
            for (Map.Entry<String, String> url : urlByPlaceholder.entrySet()) {
                restored = restored.replace(url.getKey(), url.getValue());
            }
            result.add(restored);
        }
        return result;
    }

    /**
     * Rewrites {@code text} so that the later split on {@code SPLIT_CHARS_REGEX} yields the
     * desired tokens. Characters that would otherwise act as delimiters inside numbers, dates,
     * abbreviations, initials, tags and similar constructs are replaced by private-use
     * stand-ins or followed by a non-breaking marker, and a breaking marker is inserted where a
     * token boundary is wanted although no delimiter is present. URLs and e-mail addresses are
     * replaced by a single private-use character each.
     * <p>
     * The order of the steps matters: later patterns rely on markers inserted by earlier ones.
     *
     * @param text non-blank text to prepare (the first character is inspected, so the text
     *             must not be empty)
     * @param urls output map; receives one entry per URL or e-mail address found, keyed by the
     *             one-character stand-in that now occupies its place in the returned text
     * @return the rewritten text, still containing the placeholder characters
     */
    private String adjustTextForTokenizing(String text, HashMap<String, String> urls) {
        text = UkrainianWordTokenizer.cleanup(text);

        // Dash glued to the first word of the text: force a break after the dash.
        if ("\u2014\u2013-".indexOf(text.charAt(0)) >= 0) {
            Matcher leadingDash = LEADING_DASH_PATTERN.matcher(text);
            if (leadingDash.find()) {
                text = leadingDash.replaceFirst(N_DASH_SPACE_REPL);
            } else {
                leadingDash = LEADING_DASH_PATTERN_2.matcher(text);
                if (leadingDash.find()) {
                    text = leadingDash.replaceFirst(N_DASH_SPACE_REPL);
                }
            }
        }

        if (text.contains(",")) {
            text = DECIMAL_COMMA_PATTERN.matcher(text).replaceAll(DECIMAL_COMMA_REPL);
        }

        // Swap every URL / e-mail address for a unique private-use character and remember it.
        if (text.contains("http") || text.contains("www") || text.contains("@") || text.contains("ftp")) {
            Matcher urlMatcher = URL_PATTERN.matcher(text);
            int urlReplaceChar = URL_START_REPLACE_CHAR;
            while (urlMatcher.find()) {
                String replaceChar = String.valueOf((char)urlReplaceChar);
                urls.put(replaceChar, urlMatcher.group());
                text = urlMatcher.replaceFirst(replaceChar);
                ++urlReplaceChar;
                // The text changed, so matching restarts on the new text.
                urlMatcher = URL_PATTERN.matcher(text);
            }
        }

        // Em dash followed by whitespace: force a break before the dash.
        if (text.indexOf('\u2014') != -1) {
            text = text.replaceAll("\u2014([\\h\\v])", "\ue110\u2014$1");
        }

        boolean nDashPresent = text.indexOf('\u2013') != -1;
        if (text.indexOf('-') != -1 || nDashPresent) {
            text = DASH_NUMBERS_PATTERN.matcher(text).replaceAll(DASH_NUMBERS_REPL);
            if (nDashPresent) {
                text = N_DASH_SPACE_PATTERN.matcher(text).replaceAll(N_DASH_SPACE_REPL);
                text = N_DASH_SPACE_PATTERN2.matcher(text).replaceAll(N_DASH_SPACE_REPL);
            }
        }

        // Two fixed slash-containing abbreviations keep their slash.
        if (text.indexOf("\u0441/\u0433") != -1) {
            text = text.replaceAll("\u0441/\u0433", "\u0441\ue007\u0433");
        }
        if (text.indexOf("\u041b/\u0414\u041d\u0420") != -1) {
            text = text.replaceAll("\u041b/\u0414\u041d\u0420", "\u041b\ue007\u0414\u041d\u0420");
        }

        if (text.indexOf("\u0440.") != -1) {
            Matcher yearMatcher = YEAR_WITH_R.matcher(text);
            if (yearMatcher.find()) {
                text = yearMatcher.replaceAll(N_DASH_SPACE_REPL);
            }
        }

        text = text.replace("#", "\ue110#");
        if (text.indexOf('%') >= 0) {
            text = text.replaceAll("%([^-])", "%\ue110$1");
        }

        text = COMPOUND_WITH_QUOTES1.matcher(text).replaceAll("$1$2\ue120$3\ue120$4\ue120");
        text = COMPOUND_WITH_QUOTES2.matcher(text).replaceAll("$1\ue120$2\ue120$3\ue120$4");
        if (text.indexOf('[') != -1) {
            text = WORDS_WITH_BRACKETS_PATTERN.matcher(text).replaceAll("$1\\[\ue120$2\\]\ue120");
        }

        // The dot handling is needed when a dot occurs before the last visible character, or
        // when the only dot is the final one and it belongs to an abbreviation.
        int dotIndex = text.indexOf('.');
        String textRtrimmed = text.replaceFirst("[\\h\\v]*$", "");
        boolean dotInsideSentence = dotIndex >= 0 && dotIndex < textRtrimmed.length() - 1;
        if (dotInsideSentence || dotIndex == textRtrimmed.length() - 1 && ABBR_AT_THE_END.matcher(text).find()) {
            text = UkrainianWordTokenizer.protectDateDots(text);
            text = DOTTED_NUMBERS_PATTERN3.matcher(text).replaceAll("$1.\ue120$2.\ue120$3");
            text = DOTTED_NUMBERS_PATTERN.matcher(text).replaceAll("$1.\ue120$2");
            text = ABBR_DOT_2_SMALL_LETTERS_PATTERN.matcher(text).replaceAll("$1.\ue120\ue110$2.\ue120\ue110");
            text = ABBR_DOT_VO_PATTERN1.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
            text = ABBR_DOT_VO_PATTERN2.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
            text = ABBR_DOT_VO_PATTERN3.matcher(text).replaceAll(ABBR_DOT_2_SMALL_LETTERS_REPL);
            text = ABBR_DOT_ART_PATTERN.matcher(text).replaceAll("$1\ue003\ue110$2");
            text = ABBR_DOT_MAN_PATTERN.matcher(text).replaceAll("$1\ue003\ue110$2");
            text = ABBR_DOT_TYS_PATTERN1.matcher(text).replaceAll("$1$2\ue003\ue110");
            text = ABBR_DOT_TYS_PATTERN2.matcher(text).replaceAll("$1\ue003\ue110$2");
            text = ABBR_DOT_LAT_PATTERN.matcher(text).replaceAll("$1\ue003\ue110$2");
            text = ABBR_DOT_PROF_PATTERN.matcher(text).replaceAll("$1\ue003\ue110$2");
            text = ABBR_DOT_GUB_PATTERN.matcher(text).replaceAll("$1\ue003\ue110");
            text = ABBR_DOT_DASH_PATTERN.matcher(text).replaceAll("$1\ue003$2");
            text = INITIALS_DOT_PATTERN_SP_2.matcher(text).replaceAll(INITIALS_DOT_REPL_SP_2);
            text = INITIALS_DOT_PATTERN_SP_1.matcher(text).replaceAll("$1\ue003\ue110$2");
            text = INITIALS_DOT_PATTERN_RSP_2.matcher(text).replaceAll(INITIALS_DOT_REPL_RSP_2);
            text = INITIALS_DOT_PATTERN_RSP_1.matcher(text).replaceAll(INITIALS_DOT_REPL_RSP_1);
            text = ABBR_DOT_KUB_SM_PATTERN.matcher(text).replaceAll("$1.\ue120\ue110$2");
            text = ABBR_DOT_S_G_PATTERN.matcher(text).replaceAll("$1\ue003$2\ue003\ue110");
            text = ABBR_DOT_PN_ZAH_PATTERN.matcher(text).replaceAll("$1.\ue120\ue110$2.\ue120\ue110");
            text = ABBR_DOT_I_T_P_PATTERN.matcher(text).replaceAll("$1\ue120\ue110$2\ue120\ue110");
            text = ABBR_DOT_I_T_CH_PATTERN.matcher(text).replaceAll("$1\ue120\ue110$2\ue120\ue110");
            text = ABBR_DOT_T_ZV_PATTERN.matcher(text).replaceAll("$1\ue120\ue110$2\ue120\ue110");
            text = ABBR_DOT_RED_AVT_PATTERN.matcher(text).replaceAll("$1.\ue120\ue110$2");
            text = ABBR_DOT_NON_ENDING_PATTERN.matcher(text).replaceAll("$1.\ue120\ue110");
            text = ABBR_DOT_NON_ENDING_PATTERN_2.matcher(text).replaceAll("$1\ue120\ue110$2");
            text = INVALID_MLN_DOT_PATTERN.matcher(text).replaceAll("$1.\ue120\ue110$2");
        }

        // Asterisks glued to a Cyrillic word (before or after it) become separate tokens.
        if (text.contains("*")) {
            text = text.replaceAll("((?:^|[^\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490])\\*+)([\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490])", N_DASH_SPACE_REPL);
            text = text.replaceAll("([\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490])(\\*+(?:[^\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490]|$))", N_DASH_SPACE_REPL);
        }

        text = ABBR_DOT_ENDING_PATTERN.matcher(text).replaceAll("$1.\ue120\ue110");

        // Numbers grouped by spaces ("1 000 000"): swap the spaces for a stand-in so the
        // number stays one token. The matched text holds only digits and whitespace, so it is
        // safe to pass to appendReplacement unquoted.
        Matcher spacedDecimalMatcher = DECIMAL_SPACE_PATTERN.matcher(text);
        if (spacedDecimalMatcher.find()) {
            StringBuffer sb = new StringBuffer();
            do {
                String splitNumberAdjusted = spacedDecimalMatcher.group(0)
                        .replace(' ', '\ue002')
                        .replace('\u00a0', '\ue002')
                        .replace('\u202f', '\ue002');
                spacedDecimalMatcher.appendReplacement(sb, splitNumberAdjusted);
            } while (spacedDecimalMatcher.find());
            spacedDecimalMatcher.appendTail(sb);
            text = sb.toString();
        }

        if (text.contains(":")) {
            text = COLON_NUMBERS_PATTERN.matcher(text).replaceAll(COLON_NUMBERS_REPL);
        }
        if (text.contains("(")) {
            text = BRACE_IN_WORD_PATTERN.matcher(text).replaceAll("$1\ue005$2\ue006");
        }
        if (text.contains("<")) {
            // Keep a simple tag as one token, including the slash of a closing/self-closing tag.
            text = XML_TAG_PATTERN.matcher(text).replaceAll("\ue110\ue008$1\ue009\ue110");
            text = text.replace("\ue008/", "\ue008\ue010");
            text = text.replace("/\ue009", "\ue010\ue009");
        }
        if (text.contains("-")) {
            text = text.replaceAll("([\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490])([\u00bb\"-]+-)", N_DASH_SPACE_REPL);
            text = text.replaceAll("([\u00bb\"-]+-)([\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490])", N_DASH_SPACE_REPL);
        }
        if (text.contains(SOFT_HYPHEN_WRAP)) {
            text = text.replaceAll("(?<!\\s)\u00ad\n", SOFT_HYPHEN_WRAP_SUBST);
        }
        if (text.indexOf('\'') >= 0) {
            text = APOSTROPHE_BEGIN_PATTERN.matcher(text).replaceAll("$1'\ue110$2");
            text = APOSTROPHE_END_PATTER.matcher(text).replaceAll("$1\ue110'$2");
        }
        if (text.contains("+")) {
            text = text.replaceAll("\\+(?=[\u0430-\u044f\u0456\u0457\u0454\u0491\u0410-\u042f\u0406\u0407\u0404\u0490])", "\ue110+\ue110");
        }
        return NUMBER_MISSING_SPACE.matcher(text).replaceAll(N_DASH_SPACE_REPL);
    }

    /**
     * Replaces the dots inside every date matched by {@code DATE_PATTERN} with
     * {@code NON_BREAKING_DOT_SUBST}, leaving digits and hyphens untouched.
     * <p>
     * The replacement is built from the matched text itself rather than from numbered groups:
     * the pattern's three alternatives capture into different groups (1-3, 4-6, 7-9), and a
     * template that references only groups 1-3 expands to nothing for the other two layouts,
     * which would delete the digits of the date.
     *
     * @param text text that may contain dates
     * @return the text with date-internal dots protected; the same instance if no date matched
     */
    private static String protectDateDots(String text) {
        Matcher dateMatcher = DATE_PATTERN.matcher(text);
        if (!dateMatcher.find()) {
            return text;
        }
        StringBuffer sb = new StringBuffer();
        do {
            String protectedDate = dateMatcher.group().replace('.', NON_BREAKING_DOT_SUBST);
            dateMatcher.appendReplacement(sb, Matcher.quoteReplacement(protectedDate));
        } while (dateMatcher.find());
        dateMatcher.appendTail(sb);
        return sb.toString();
    }

    /**
     * Normalises look-alike characters before tokenizing: three apostrophe variants
     * (U+2019, U+02BC, U+2018) become {@code '}, U+201A becomes {@code ,} and U+2011 becomes
     * {@code -}. Afterwards a quote-like character that {@code WEIRD_APOSTROPH_PATTERN} finds
     * between a consonant and a vowel is protected with non-breaking markers.
     *
     * @param text the raw text
     * @return the normalised text
     */
    private static String cleanup(String text) {
        StringBuilder normalized = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);
            switch (ch) {
                case '\u2019':
                case '\u02bc':
                case '\u2018':
                    normalized.append('\'');
                    break;
                case '\u201a':
                    normalized.append(',');
                    break;
                case '\u2011':
                    normalized.append('-');
                    break;
                default:
                    normalized.append(ch);
                    break;
            }
        }
        Matcher weirdApostrophe = WEIRD_APOSTROPH_PATTERN.matcher(normalized.toString());
        return weirdApostrophe.replaceAll("$1\ue120$2\ue120$3");
    }

    /**
     * Cuts {@code str} at every match of {@code delimPattern} and returns both the text
     * between matches and the matches themselves, in order of appearance. Empty stretches
     * between two adjacent matches (or at either end of the string) are not emitted.
     *
     * @param str          the text to split
     * @param delimPattern pattern describing a delimiter
     * @return alternating non-delimiter and delimiter pieces; empty for an empty input
     */
    private static List<String> splitWithDelimiters(String str, Pattern delimPattern) {
        ArrayList<String> pieces = new ArrayList<String>();
        Matcher m = delimPattern.matcher(str);
        int cursor = 0;
        for (boolean found = m.find(); found; found = m.find()) {
            // Text between the previous delimiter and this one, if there is any.
            if (cursor != m.start()) {
                pieces.add(str.substring(cursor, m.start()));
            }
            pieces.add(m.group());
            cursor = m.end();
        }
        // Whatever follows the last delimiter.
        if (cursor != str.length()) {
            pieces.add(str.substring(cursor));
        }
        return pieces;
    }
}

