/*
 * Decompiled with CFR 0.152.
 */
package eus.ixa.ixa.pipe.ml.tok;

import eus.ixa.ixa.pipe.ml.tok.RuleBasedSegmenter;
import eus.ixa.ixa.pipe.ml.tok.RuleBasedTokenizer;
import eus.ixa.ixa.pipe.ml.utils.StringUtils;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Decides whether a period at the end of a word should be treated as a
 * sentence or token break, using a per-language list of exceptions loaded
 * from the classpath resource {@code /tokenizer/<lang>-exceptions.txt}.
 *
 * <p>Two entry points are offered: {@link #segmenterExceptions(String[])}
 * marks sentence boundaries, and {@link #TokenizerNonBreaker(String)} detaches
 * word-final periods that are not covered by an exception.
 */
public class NonPeriodBreaker {
    /**
     * A run of alphanumerics, periods and hyphens (group 1), followed by
     * optional {@code RuleBasedSegmenter.FINAL_PUNCT} characters (group 2) and
     * one or more periods (group 3) at the end of the input.
     */
    public static Pattern nonSegmentedWords = Pattern.compile("([\\p{Alnum}\\.\\-]*)(" + RuleBasedSegmenter.FINAL_PUNCT + "*)(\\.+)$", Pattern.UNICODE_CHARACTER_CLASS);
    /**
     * Optional spaces and {@code RuleBasedSegmenter.INITIAL_PUNCT} characters
     * followed by an uppercase letter or a digit.
     */
    public static Pattern nextCandidateWord = Pattern.compile("([\\ ]*" + RuleBasedSegmenter.INITIAL_PUNCT + "*[\\ ]*[\\p{Lu}\\p{Digit}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** Abbreviations whose period is kept when the following word starts with a digit. */
    public static String NON_BREAKER_DIGITS = "(al|[Aa]rt|ca|figs?|[Nn]os?|[Nn]rs?|op|p|pp|[Pp]\u00e1g)";
    /** A period, then uppercase letters or hyphens, then one or more periods at the end of the input. */
    public static Pattern acronym = Pattern.compile("(\\.)[\\p{Lu}\\-]+([\\.]+)$", Pattern.UNICODE_CHARACTER_CLASS);
    /** Digits followed by a period, optional spaces, and more digits. */
    public static Pattern numbers = Pattern.compile("(\\p{Digit}+[\\.])[\\ ]*(\\p{Digit}+)", Pattern.UNICODE_CHARACTER_CLASS);
    /** A whole word made of non-whitespace characters (group 1) followed by a single final period. */
    public static Pattern wordDot = Pattern.compile("^(\\S+)\\.$");
    /** Any alphabetic character. */
    public static Pattern alphabetic = Pattern.compile("\\p{Alpha}", Pattern.UNICODE_CHARACTER_CLASS);
    /** One or more lowercase letters at the start of the input. */
    public static Pattern startLower = Pattern.compile("^\\p{Lower}+", Pattern.UNICODE_CHARACTER_CLASS);
    /** A punctuation character from a fixed set at the start of the input. */
    public static Pattern startPunct = Pattern.compile("^[\\!#\\$%&\\(\\)\\*\\+,-\\/:;=>\\?@\\[\\\\\\]\\^\\{\\|\\}~]");
    /** One or more digits at the start of the input. */
    public static Pattern startDigit = Pattern.compile("^\\p{Digit}+", Pattern.UNICODE_CHARACTER_CLASS);

    /** Language codes for which an exceptions resource is looked up. */
    private static final String[] SUPPORTED_LANGUAGES = {"ca", "de", "en", "es", "eu", "fr", "gl", "it", "nl", "pt", "ru"};

    /**
     * The language exception list, compiled once. Exception lists can be long,
     * so compiling the regex per word (as {@code String.matches} does) is wasteful.
     */
    private final Pattern nonBreaker;

    /**
     * Builds a breaker for the language named by the {@code language} property.
     *
     * <p>If no exceptions resource is found for that language, a warning is
     * printed to standard error and the JVM is terminated with status 1.
     *
     * @param properties configuration; must contain a {@code language} entry
     * @throws NullPointerException if {@code properties} is null or has no
     *         {@code language} entry
     */
    public NonPeriodBreaker(Properties properties) {
        String lang = properties.getProperty("language");
        if (lang == null) {
            throw new NullPointerException("The 'language' property is required to load the tokenizer exceptions");
        }
        // Wrapped in a group so that the whole word must match one alternative.
        this.nonBreaker = Pattern.compile("(" + this.loadNonBreakerRegex(lang) + ")");
    }

    /**
     * Reads the exceptions resource for {@code lang} and turns its entries into
     * a disjunctive regular expression.
     *
     * <p>Each line is trimmed; lines starting with {@code #} are skipped.
     *
     * @param lang the language code
     * @return the regex produced by {@code StringUtils.createDisjunctRegexFromList}
     */
    private String loadNonBreakerRegex(String lang) {
        ArrayList<String> nonBreakerList = new ArrayList<String>();
        InputStream nonBreakerInputStream = this.getNonBreakerInputStream(lang);
        if (nonBreakerInputStream == null) {
            System.err.println("WARNING: No exceptions file for language " + lang + " in ixa-pipe-ml/src/main/resources/tokenizer/!!");
            System.exit(1);
        }
        // try-with-resources closes the reader and the underlying stream.
        // The charset is explicit so that non-ASCII entries do not depend on
        // the platform default; assumes the bundled resources are UTF-8.
        try (BufferedReader breader = new BufferedReader(new InputStreamReader(nonBreakerInputStream, StandardCharsets.UTF_8))) {
            String line;
            while ((line = breader.readLine()) != null) {
                line = line.trim();
                if (!line.startsWith("#")) {
                    nonBreakerList.add(line);
                }
            }
        }
        catch (IOException e) {
            // Best effort: report the failure and continue with the entries read so far.
            e.printStackTrace();
        }
        return StringUtils.createDisjunctRegexFromList(nonBreakerList);
    }

    /**
     * Opens the exceptions resource for a supported language.
     *
     * @param lang the language code, compared case-insensitively
     * @return the resource stream, or {@code null} if the language is not
     *         supported or the resource is missing
     */
    private final InputStream getNonBreakerInputStream(String lang) {
        for (String code : SUPPORTED_LANGUAGES) {
            if (lang.equalsIgnoreCase(code)) {
                return this.getClass().getResourceAsStream("/tokenizer/" + code + "-exceptions.txt");
            }
        }
        return null;
    }

    /**
     * Segments each input line into sentences.
     *
     * @param lines the lines to segment
     * @return the resulting sentences, in input order
     */
    public String[] segmenterExceptions(String[] lines) {
        ArrayList<String> sentences = new ArrayList<String>();
        for (String line : lines) {
            // Sentence boundaries are marked with a newline by segmenterNonBreaker.
            for (String lineSentence : this.segmenterNonBreaker(line).split("\n")) {
                sentences.add(lineSentence);
            }
        }
        return sentences.toArray(new String[0]);
    }

    /**
     * Appends a newline to every word of the line that ends a sentence.
     *
     * @param line the text to segment
     * @return the space-joined words, with a newline after each sentence-final word
     */
    private String segmenterNonBreaker(String line) {
        line = line.trim();
        line = RuleBasedTokenizer.doubleSpaces.matcher(line).replaceAll(" ");
        String[] words = line.split(" ");
        if (words.length == 0) {
            // split drops trailing empty strings, so a line reduced to a lone
            // separator yields no words at all.
            return "";
        }
        StringBuilder sb = new StringBuilder();
        int last = words.length - 1;
        // The last word has no successor, so it is never a boundary candidate.
        for (int i = 0; i < last; ++i) {
            sb.append(words[i]);
            if (this.isSentenceBoundary(words[i], words[i + 1])) {
                sb.append("\n");
            }
            sb.append(" ");
        }
        sb.append(words[last]);
        return sb.toString();
    }

    /**
     * Tells whether the period(s) ending {@code word} close a sentence.
     *
     * @param word the candidate word
     * @param nextWord the word that follows it
     * @return {@code true} if a sentence break belongs after {@code word}
     */
    private boolean isSentenceBoundary(String word, String nextWord) {
        Matcher nonSegmentedWordMatcher = nonSegmentedWords.matcher(word);
        if (!nonSegmentedWordMatcher.find()) {
            return false;
        }
        // replaceAll (not group()) is intentional: any text before the match is
        // kept in the result, which the emptiness checks below depend on.
        String curWord = nonSegmentedWordMatcher.replaceAll("$1");
        String finalPunct = nonSegmentedWordMatcher.replaceAll("$2");
        boolean bareWord = !curWord.isEmpty() && finalPunct.isEmpty();
        if (bareWord && this.nonBreaker.matcher(curWord).matches()) {
            return false;
        }
        if (acronym.matcher(word).find()) {
            return false;
        }
        if (!nextCandidateWord.matcher(nextWord).find()) {
            return false;
        }
        if (bareWord && curWord.matches(NON_BREAKER_DIGITS) && startDigit.matcher(nextWord).find()) {
            return false;
        }
        return true;
    }

    /**
     * Detaches the final period of every word that is not covered by an exception.
     *
     * @param line the text to tokenize
     * @return the words joined by single spaces, each one (including the last)
     *         followed by a space; detached periods appear as {@code " ."}
     */
    public String TokenizerNonBreaker(String line) {
        line = line.trim();
        line = RuleBasedTokenizer.doubleSpaces.matcher(line).replaceAll(" ");
        String[] words = line.split(" ");
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < words.length; ++i) {
            String word = words[i];
            Matcher wordDotMatcher = wordDot.matcher(word);
            if (wordDotMatcher.find()) {
                String curWord = wordDotMatcher.replaceAll("$1");
                String nextWord = i < words.length - 1 ? words[i + 1] : null;
                if (!this.keepsFinalPeriod(curWord, nextWord)) {
                    word = curWord + " .";
                }
            }
            sb.append(word).append(" ");
        }
        return sb.toString();
    }

    /**
     * Tells whether the period following {@code curWord} must stay attached.
     *
     * @param curWord the word without its final period
     * @param nextWord the following word, or {@code null} if there is none
     * @return {@code true} if the period is not to be split off
     */
    private boolean keepsFinalPeriod(String curWord, String nextWord) {
        if (curWord.contains(".") && alphabetic.matcher(curWord).find()) {
            return true;
        }
        if (this.nonBreaker.matcher(curWord).matches()) {
            return true;
        }
        if (nextWord == null) {
            return false;
        }
        if (startLower.matcher(nextWord).find() || startPunct.matcher(nextWord).find()) {
            return true;
        }
        return curWord.matches(NON_BREAKER_DIGITS) && startDigit.matcher(nextWord).find();
    }
}

