/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.tokenizers.ca;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.tagging.ca.CatalanTagger;
import org.languagetool.tokenizers.WordTokenizer;

/**
 * Word tokenizer for Catalan text.
 *
 * <p>Tokenization runs in three stages:
 * <ol>
 *   <li>Character sequences that must not be split (apostrophes between letters, decimal
 *       separators, digit-grouping spaces, the "ela geminada") are replaced by textual
 *       placeholders made only of word characters.</li>
 *   <li>The text is cut into runs of word characters and single non-word characters.</li>
 *   <li>Each run gets its placeholders restored and is then split further with an ordered
 *       list of regular expressions (leading elided forms, trailing pronoun clitics,
 *       contractions) and, for hyphenated forms, with the help of {@link CatalanTagger}.</li>
 * </ol>
 *
 * <p>The resulting list is finally handed to {@code joinEMailsAndUrls}, which is inherited
 * from {@link WordTokenizer}.
 */
public class CatalanWordTokenizer
extends WordTokenizer {
    // NOTE: INSTANCE is created before the static fields below are initialised. The
    // constructor must therefore only depend on compile-time constants (String/int
    // literals and constant expressions), never on the static Pattern fields.
    public static final CatalanWordTokenizer INSTANCE = new CatalanWordTokenizer();

    /** CASE_INSENSITIVE | UNICODE_CASE (numeric value 66). */
    private static final int CI_UNICODE = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    /** Body of the character class describing what may be part of a word-like run. */
    private static final String wordCharacters = "\u00a7\u00a9@\u20ac\u00a3\\$_\\p{L}\\d\u00b7\\-\u0300-\u036f\u00a8\u2070-\u209f\u00b0%\u2030\u2031&\ufffd\u00ad\u00ac";
    /** Matches either a maximal run of word characters or one single non-word character. */
    private static final Pattern tokenizerPattern = Pattern.compile("[" + wordCharacters + "]+|[^" + wordCharacters + "]");
    /** One capturing group listing the trailing forms (apostrophe or hyphen + pronoun) that are split off. */
    private static final String PF = "(['\u2019]en|['\u2019]hi|['\u2019]ho|['\u2019]l|['\u2019]ls|['\u2019]m|['\u2019]n|['\u2019]ns|['\u2019]s|['\u2019]t|-el|-els|-em|-en|-ens|-hi|-ho|-l|-la|-les|-li|-lo|-los|-m|-me|-n|-ne|-nos|-s|-se|-t|-te|-us|-vos)";

    // Literal matchers used to turn placeholders back into the characters they stand for.
    private static final Pattern PATTERN_1 = Pattern.compile("xxCA_APOS_RECTExx", Pattern.LITERAL);
    private static final Pattern PATTERN_2 = Pattern.compile("xxCA_APOS_RODOxx", Pattern.LITERAL);
    private static final Pattern PATTERN_3 = Pattern.compile("xxCA_HYPHENxx", Pattern.LITERAL);
    private static final Pattern PATTERN_4 = Pattern.compile("xxCA_DECIMALPOINTxx", Pattern.LITERAL);
    private static final Pattern PATTERN_5 = Pattern.compile("xxCA_DECIMALCOMMAxx", Pattern.LITERAL);
    private static final Pattern PATTERN_6 = Pattern.compile("xxCA_SPACExx", Pattern.LITERAL);
    private static final Pattern PATTERN_7 = Pattern.compile("xxELA_GEMINADAxx", Pattern.LITERAL);
    private static final Pattern PATTERN_8 = Pattern.compile("xxELA_GEMINADA_UPPERCASExx", Pattern.LITERAL);

    /** Number of splitting patterns tried, in order, on every word-like run. */
    private static final int maxPatterns = 11;
    private final Pattern[] patterns = new Pattern[maxPatterns];

    // Patterns that locate the sequences to be protected by placeholders.
    private static final Pattern ELA_GEMINADA = Pattern.compile("([aeiou\u00e0\u00e9\u00e8\u00ed\u00f3\u00f2\u00fa\u00ef\u00fcAEIOU\u00c0\u00c8\u00c9\u00cd\u00d2\u00d3\u00da\u00cf\u00dc])l[\\.\u2022\u22c5\u2219\uf0d7]l([aeiou\u00e0\u00e9\u00e8\u00ed\u00f3\u00f2\u00fa\u00ef\u00fc])", Pattern.UNICODE_CASE);
    private static final Pattern ELA_GEMINADA_UPPERCASE = Pattern.compile("([AEIOU\u00c0\u00c8\u00c9\u00cd\u00d2\u00d3\u00da\u00cf\u00dc])L[\\.\u2022\u22c5\u2219\uf0d7]L([AEIOU\u00c0\u00c8\u00c9\u00cd\u00d2\u00d3\u00da\u00cf\u00dc])", Pattern.UNICODE_CASE);
    private static final Pattern APOSTROF_RECTE = Pattern.compile("([\\p{L}])'([\\p{L}\"\u2018\u201c\u00ab])", CI_UNICODE);
    private static final Pattern APOSTROF_RODO = Pattern.compile("([\\p{L}])\u2019([\\p{L}\"\u2018\u201c\u00ab])", CI_UNICODE);
    private static final Pattern APOSTROF_RECTE_1 = Pattern.compile("([dlDL])'(\\d[\\d\\s\\.,]?)", CI_UNICODE);
    private static final Pattern APOSTROF_RODO_1 = Pattern.compile("([dlDL])\u2019(\\d[\\d\\s\\.,]?)", CI_UNICODE);
    private static final Pattern DECIMAL_POINT = Pattern.compile("([\\d])\\.([\\d])", CI_UNICODE);
    private static final Pattern DECIMAL_COMMA = Pattern.compile("([\\d]),([\\d])", CI_UNICODE);
    private static final Pattern SPACE_DIGITS0 = Pattern.compile("([\\d]{4}) ", CI_UNICODE);
    private static final Pattern SPACE_DIGITS = Pattern.compile("([\\d]) ([\\d][\\d][\\d])", CI_UNICODE);
    private static final Pattern SPACE_DIGITS2 = Pattern.compile("([\\d]) ([\\d][\\d][\\d]) ([\\d][\\d][\\d])", CI_UNICODE);
    /** word + hyphen + "l" with apostrophe + word; used as a last resort in {@link #wordsToAdd}. */
    private static final Pattern HYPHEN_L = Pattern.compile("([\\p{L}]+)(-)([Ll]['\u2019])([\\p{L}]+)", CI_UNICODE);

    /** Inclusive code point range U+FE00..U+FE0F; such a lone character is glued to the previous token. */
    private static final int VARIATION_SELECTOR_FIRST = 0xFE00;
    private static final int VARIATION_SELECTOR_LAST = 0xFE0F;

    /**
     * Builds the ordered list of splitting patterns. The first pattern that is found in a
     * run decides how it is split: every non-null capturing group becomes a piece.
     */
    public CatalanWordTokenizer() {
        // Elided form (l', n', m', t', s', d') followed by a rest without apostrophe or hyphen.
        this.patterns[0] = Pattern.compile("^([lnmtsd]['\u2019])([^'\u2019\\-]*)$", CI_UNICODE);
        // Fixed exceptions. The alternation is wrapped in a non-capturing group so that both
        // anchors apply to both alternatives. Without it the pattern reads as
        // (^qui-sap-...)|(Castella-la$), and since it is applied with find() and only the
        // captured groups are emitted, "qui-sap-los" matched as "qui-sap-lo" and lost its "s".
        // Group numbers are unchanged.
        this.patterns[1] = Pattern.compile("^(?:(qui-sap-lo|qui-sap-la|qui-sap-los|qui-sap-les)|(Castella)(-)(la))$", CI_UNICODE);
        // Elided form + word + three trailing forms.
        this.patterns[2] = Pattern.compile("^([lnmtsd]['\u2019])(.{2,})" + PF + PF + PF + "$", CI_UNICODE);
        // Word + three trailing forms.
        this.patterns[3] = Pattern.compile("^(.{2,})" + PF + PF + PF + "$", CI_UNICODE);
        // Elided form + word + two trailing forms.
        this.patterns[4] = Pattern.compile("^([lnmtsd]['\u2019])(.{2,})" + PF + PF + "$", CI_UNICODE);
        // Word + two trailing forms.
        this.patterns[5] = Pattern.compile("^(.{2,})" + PF + PF + "$", CI_UNICODE);
        // Elided form + word + one trailing form.
        this.patterns[6] = Pattern.compile("^([lnmtsd]['\u2019])(.{2,})" + PF + "$", CI_UNICODE);
        // Word (not ending in "w" or "o") + one trailing form.
        this.patterns[7] = Pattern.compile("^(.+[^wo])" + PF + "$", CI_UNICODE);
        // Elided form followed by anything.
        this.patterns[8] = Pattern.compile("^([lnmtsd]['\u2019])(.*)$", CI_UNICODE);
        // Contractions: al/als, del/dels, pel/pels are split into two pieces.
        this.patterns[9] = Pattern.compile("^(a|de|pe)(ls?)$", CI_UNICODE);
        // "can" is split into "ca" + "n".
        this.patterns[10] = Pattern.compile("^(ca)(n)$", CI_UNICODE);
    }

    /**
     * Splits {@code text} into tokens.
     *
     * @param text the text to tokenize; must not be {@code null}
     * @return the tokens, after post-processing by the inherited {@code joinEMailsAndUrls}
     */
    public List<String> tokenize(String text) {
        List<String> tokens = new ArrayList<String>();
        Matcher tokenizerMatcher = tokenizerPattern.matcher(this.protectSpecialSequences(text));
        while (tokenizerMatcher.find()) {
            String s = tokenizerMatcher.group();
            // A lone variation selector belongs to the character before it: append it to
            // the previous token instead of emitting a token of its own.
            if (!tokens.isEmpty() && s.length() == 1 && isVariationSelector(s.codePointAt(0))) {
                int last = tokens.size() - 1;
                tokens.set(last, tokens.get(last) + s);
                continue;
            }
            s = restorePlaceholders(s);
            // Leading hyphens become separate tokens right away ...
            while (s.length() > 1 && s.startsWith("-")) {
                tokens.add("-");
                s = s.substring(1);
            }
            // ... trailing hyphens are counted and emitted after the word itself.
            int hyphensAtEnd = 0;
            while (s.length() > 1 && s.endsWith("-")) {
                s = s.substring(0, s.length() - 1);
                ++hyphensAtEnd;
            }
            // Try the splitting patterns in order; the first hit wins.
            Matcher matcher = null;
            boolean matchFound = false;
            for (int j = 0; j < maxPatterns && !matchFound; ++j) {
                matcher = this.patterns[j].matcher(s);
                matchFound = matcher.find();
            }
            if (matchFound) {
                for (int i = 1; i <= matcher.groupCount(); ++i) {
                    String groupStr = matcher.group(i);
                    if (groupStr == null) {
                        // Group of an alternative that did not take part in the match.
                        continue;
                    }
                    tokens.addAll(this.wordsToAdd(groupStr));
                }
            } else {
                tokens.addAll(this.wordsToAdd(s));
            }
            while (hyphensAtEnd > 0) {
                tokens.add("-");
                --hyphensAtEnd;
            }
        }
        return this.joinEMailsAndUrls(tokens);
    }

    /**
     * Normalises U+2010/U+2011 hyphens to "-" and replaces every sequence that must survive
     * the raw split with a placeholder consisting of word characters only. The order of the
     * replacements matters and mirrors the restoration in {@link #restorePlaceholders}.
     */
    private String protectSpecialSequences(String text) {
        String aux = text.replace('\u2010', '-');
        aux = aux.replace('\u2011', '-');
        aux = ELA_GEMINADA.matcher(aux).replaceAll("$1xxELA_GEMINADAxx$2");
        aux = ELA_GEMINADA_UPPERCASE.matcher(aux).replaceAll("$1xxELA_GEMINADA_UPPERCASExx$2");
        aux = APOSTROF_RECTE.matcher(aux).replaceAll("$1xxCA_APOS_RECTExx$2");
        aux = APOSTROF_RECTE_1.matcher(aux).replaceAll("$1xxCA_APOS_RECTExx$2");
        aux = APOSTROF_RODO.matcher(aux).replaceAll("$1xxCA_APOS_RODOxx$2");
        aux = APOSTROF_RODO_1.matcher(aux).replaceAll("$1xxCA_APOS_RODOxx$2");
        aux = DECIMAL_POINT.matcher(aux).replaceAll("$1xxCA_DECIMALPOINTxx$2");
        aux = DECIMAL_COMMA.matcher(aux).replaceAll("$1xxCA_DECIMALCOMMAxx$2");
        // Temporarily hide the space that follows four digits so that the digit-grouping
        // patterns below cannot use it; it is turned back into a space right afterwards.
        aux = SPACE_DIGITS0.matcher(aux).replaceAll("$1xxCA_SPACE0xx");
        aux = SPACE_DIGITS2.matcher(aux).replaceAll("$1xxCA_SPACExx$2xxCA_SPACExx$3");
        aux = SPACE_DIGITS.matcher(aux).replaceAll("$1xxCA_SPACExx$2");
        return aux.replace("xxCA_SPACE0xx", " ");
    }

    /**
     * Turns the placeholders inside one raw token back into real characters. Both "ela
     * geminada" placeholders are restored with a plain "." between the two letters.
     */
    private static String restorePlaceholders(String token) {
        String s = PATTERN_1.matcher(token).replaceAll("'");
        s = PATTERN_2.matcher(s).replaceAll("\u2019");
        s = PATTERN_3.matcher(s).replaceAll("-");
        s = PATTERN_4.matcher(s).replaceAll(".");
        s = PATTERN_5.matcher(s).replaceAll(",");
        s = PATTERN_6.matcher(s).replaceAll(" ");
        s = PATTERN_7.matcher(s).replaceAll("l.l");
        s = PATTERN_8.matcher(s).replaceAll("L.L");
        return s;
    }

    /** Returns whether {@code codePoint} lies in U+FE00..U+FE0F. */
    private static boolean isVariationSelector(int codePoint) {
        return codePoint >= VARIATION_SELECTOR_FIRST && codePoint <= VARIATION_SELECTOR_LAST;
    }

    /**
     * Decides whether a piece is kept whole or split further.
     *
     * <p>A piece without a hyphen and without a trailing apostrophe is kept as is. Otherwise
     * it is kept whole when the tagger knows it (soft hyphens removed; once with typographic
     * apostrophes straightened, once with "l-l" read as "l" + middle dot + "l") or when it is
     * one of a few hard-coded hyphenated terms. Failing that, a trailing apostrophe is split
     * off, then {@link #HYPHEN_L} is tried, and finally the piece is cut at every hyphen,
     * with the hyphens kept as tokens.
     *
     * @param s the piece to examine
     * @return the resulting tokens; empty when {@code s} is empty
     */
    private List<String> wordsToAdd(String s) {
        List<String> result = new ArrayList<String>();
        if (s.isEmpty()) {
            return result;
        }
        boolean endsWithApostrophe = s.endsWith("'") || s.endsWith("\u2019");
        if (!s.contains("-") && !endsWithApostrophe) {
            result.add(s);
            return result;
        }
        // Evaluation order is deliberate: tagger lookup, fixed exceptions, second lookup.
        if (isTagged(s.replace("\u00ad", "").replace("\u2019", "'"))
                || isHyphenatedException(s)
                || isTagged(s.replace("\u00ad", "").replace("l-l", "l\u00b7l"))) {
            result.add(s);
            return result;
        }
        if (endsWithApostrophe && s.length() > 1) {
            result.addAll(this.wordsToAdd(s.substring(0, s.length() - 1)));
            result.add(s.substring(s.length() - 1));
            return result;
        }
        Matcher matcher = HYPHEN_L.matcher(s);
        if (matcher.matches()) {
            for (int i = 1; i <= matcher.groupCount(); ++i) {
                result.addAll(this.wordsToAdd(matcher.group(i)));
            }
            return result;
        }
        // Last resort: cut at every hyphen, keeping the hyphens as tokens.
        StringTokenizer pieces = new StringTokenizer(s, "-", true);
        while (pieces.hasMoreElements()) {
            result.add(pieces.nextToken());
        }
        return result;
    }

    /** Returns whether the Catalan tagger reports the given single word as tagged. */
    private static boolean isTagged(String word) {
        return CatalanTagger.INSTANCE_CAT.tag(Arrays.asList(word)).get(0).isTagged();
    }

    /** Returns whether {@code s} is one of the hard-coded hyphenated terms that stay whole. */
    private static boolean isHyphenatedException(String s) {
        return s.equalsIgnoreCase("mers-cov") || s.equalsIgnoreCase("mcgraw-hill") || s.equalsIgnoreCase("sars-cov-2") || s.equalsIgnoreCase("sars-cov") || s.equalsIgnoreCase("ph-metre") || s.equalsIgnoreCase("ph-metres");
    }
}

