/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.tokenizers.pt;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.tagging.pt.PortugueseTagger;
import org.languagetool.tokenizers.WordTokenizer;

public class PortugueseWordTokenizer
extends WordTokenizer {
    private final PortugueseTagger tagger = new PortugueseTagger();
    private static final String SPLIT_CHARS = " -\u00a0\u115f\u1160\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u200c\u200d\u200e\u200f\u2010\u2011\u2012\u2013\u2014\u2015\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f\u205f\u2060\u2061\u2062\u2063\u206a\u206b\u206c\u206d\u206e\u206f\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb*+\u00d7\u2217\u00b7\u00f7:=\u2260\u2242\u2243\u2244\u2245\u2246\u2247\u2248\u2249\u2264\u2265\u226a\u226b\u2227\u2228\u2229\u222a\u2208\u2209\u220a\u220b\u220c\u220d,.;()[]{}<>!?:/\\\"'\u00ab\u00bb\u201e\u201d\u201c\u2018`\u2019\u2026\u00bf\u00a1\t\n\r";
    private static final char DECIMAL_COMMA_SUBST = '\ue001';
    private static final char NON_BREAKING_SPACE_SUBST = '\ue002';
    private static final char NON_BREAKING_DOT_SUBST = '\ue003';
    private static final char NON_BREAKING_COLON_SUBST = '\ue004';
    private static final String HYPHEN_SUBST = "\u0001\u0001PT_HYPHEN\u0001\u0001";
    private static final Pattern DECIMAL_COMMA_PATTERN = Pattern.compile("([\\d]),([\\d])", 66);
    private static final String DECIMAL_COMMA_REPL = "$1\ue001$2";
    private static final Pattern DECIMAL_SPACE_PATTERN = Pattern.compile("(?<=^|[\\s(])\\d{1,3}( [\\d]{3})+(?=[\\s(]|$)", 66);
    private static final Pattern DOTTED_NUMBERS_PATTERN = Pattern.compile("([\\d])\\.([\\d])", 66);
    private static final String DOTTED_NUMBERS_REPL = "$1\ue003$2";
    private static final Pattern COLON_NUMBERS_PATTERN = Pattern.compile("([\\d]):([\\d])", 66);
    private static final String COLON_NUMBERS_REPL = "$1\ue004$2";
    private static final Pattern DATE_PATTERN = Pattern.compile("([\\d]{2})\\.([\\d]{2})\\.([\\d]{4})|([\\d]{4})\\.([\\d]{2})\\.([\\d]{2})|([\\d]{4})-([\\d]{2})-([\\d]{2})", 66);
    private static final String DATE_PATTERN_REPL = "$1\ue003$2\ue003$3";
    private static final Pattern DOTTED_ORDINALS_PATTERN = Pattern.compile("([\\d])\\.([ao\u00aa\u00ba][s\u02e2]?)", 66);
    private static final String DOTTED_ORDINALS_REPL = "$1\ue003$2";
    private static final Pattern HYPHEN_PATTERN = Pattern.compile("([\\p{L}])-([\\p{L}\\d])", 66);
    private static final String HYPHEN_REPL = "$1\u0001\u0001PT_HYPHEN\u0001\u0001$2";
    private static final Pattern NEARBY_HYPHENS_PATTERN = Pattern.compile("([\\p{L}])-([\\p{L}])-([\\p{L}])", 66);
    private static final String NEARBY_HYPHENS_REPL = "$1\u0001\u0001PT_HYPHEN\u0001\u0001$2\u0001\u0001PT_HYPHEN\u0001\u0001$3";

    public List<String> tokenize(String text) {
        Matcher spacedDecimalMatcher;
        int dotIndex;
        boolean dotInsideSentence;
        if (text.contains(",")) {
            text = DECIMAL_COMMA_PATTERN.matcher(text).replaceAll(DECIMAL_COMMA_REPL);
        }
        boolean bl = dotInsideSentence = (dotIndex = text.indexOf(46)) >= 0 && dotIndex < text.length() - 1;
        if (dotInsideSentence) {
            text = DATE_PATTERN.matcher(text).replaceAll(DATE_PATTERN_REPL);
            text = DOTTED_NUMBERS_PATTERN.matcher(text).replaceAll("$1\ue003$2");
            text = DOTTED_ORDINALS_PATTERN.matcher(text).replaceAll("$1\ue003$2");
        }
        if ((spacedDecimalMatcher = DECIMAL_SPACE_PATTERN.matcher(text)).find()) {
            StringBuffer sb = new StringBuffer();
            do {
                String splitNumber = spacedDecimalMatcher.group(0);
                String splitNumberAdjusted = splitNumber.replace(' ', '\ue002');
                splitNumberAdjusted = splitNumberAdjusted.replace('\u00a0', '\ue002');
                spacedDecimalMatcher.appendReplacement(sb, splitNumberAdjusted);
            } while (spacedDecimalMatcher.find());
            spacedDecimalMatcher.appendTail(sb);
            text = sb.toString();
        }
        if (text.contains(":")) {
            text = COLON_NUMBERS_PATTERN.matcher(text).replaceAll(COLON_NUMBERS_REPL);
        }
        if (text.contains("-")) {
            text = NEARBY_HYPHENS_PATTERN.matcher(text).replaceAll(NEARBY_HYPHENS_REPL);
            text = HYPHEN_PATTERN.matcher(text).replaceAll(HYPHEN_REPL);
        }
        ArrayList<String> tokenList = new ArrayList<String>();
        StringTokenizer st = new StringTokenizer(text, SPLIT_CHARS, true);
        while (st.hasMoreElements()) {
            String token = st.nextToken();
            token = token.replace('\ue001', ',');
            token = token.replace('\ue004', ':');
            token = token.replace('\ue002', ' ');
            token = token.replace('\ue003', '.');
            token = token.replaceAll(HYPHEN_SUBST, "-");
            tokenList.addAll(this.wordsToAdd(token));
        }
        return this.joinEMailsAndUrls(tokenList);
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    private List<String> wordsToAdd(String s) {
        ArrayList<String> l = new ArrayList<String>();
        PortugueseWordTokenizer portugueseWordTokenizer = this;
        synchronized (portugueseWordTokenizer) {
            if (!s.isEmpty()) {
                if (!s.contains("-")) {
                    l.add(s);
                } else if (this.tagger.tag(Arrays.asList(s.replace("\u2019", "'"))).get(0).isTagged()) {
                    l.add(s);
                } else if (s.equalsIgnoreCase("mers-cov") || s.equalsIgnoreCase("mcgraw-hill") || s.equalsIgnoreCase("sars-cov-2") || s.equalsIgnoreCase("sars-cov") || s.equalsIgnoreCase("ph-metre") || s.equalsIgnoreCase("ph-metres") || s.equalsIgnoreCase("anti-ivg") || s.equalsIgnoreCase("anti-uv") || s.equalsIgnoreCase("anti-vih") || s.equalsIgnoreCase("al-qa\u00efda")) {
                    l.add(s);
                } else {
                    StringTokenizer st2 = new StringTokenizer(s, "-", true);
                    while (st2.hasMoreElements()) {
                        l.add(st2.nextToken());
                    }
                }
            }
            return l;
        }
    }
}

