/*
 * Decompiled with CFR 0.152.
 */
package eus.ixa.ixa.pipe.tok;

import eus.ixa.ixa.pipe.seg.RuleBasedSegmenter;
import eus.ixa.ixa.pipe.tok.RuleBasedTokenizer;
import eus.ixa.ixa.pipe.tok.StringUtils;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Decides which periods are sentence/token boundaries and which belong to an
 * abbreviation, acronym or number.
 *
 * <p>A language-specific list of non-breaking prefixes is read from the classpath
 * resource {@code /<lang>-nonbreaker.txt} when the instance is constructed.
 * {@link #SegmenterNonBreaker(String)} uses it to place sentence breaks and
 * {@link #TokenizerNonBreaker(String)} uses it to decide whether a trailing period
 * is split off a word.
 */
public class NonPeriodBreaker {
    /** Section sign used as a temporary marker for candidate sentence breaks. */
    public static String SECTION = "\u00a7";
    /** Matches the {@link #SECTION} marker. */
    public static Pattern section = Pattern.compile(SECTION);
    /**
     * Candidate break: group 1 is a (possibly empty) run of alphanumerics, dots and hyphens,
     * optional {@code RuleBasedSegmenter.FINAL_PUNCT} and one or more periods; group 2 is
     * optional {@code RuleBasedSegmenter.INITIAL_PUNCT}, optional spaces and an uppercase letter.
     */
    public static Pattern segmentAll = Pattern.compile("([\\p{Alnum}\\.-]*" + RuleBasedSegmenter.FINAL_PUNCT + "*[\\.]+)[\\ ]*(" + RuleBasedSegmenter.INITIAL_PUNCT + "*[\\ ]*[\\p{Lu}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** Prefixes whose period is not a break when a digit follows. */
    public static String NON_BREAKER_DIGITS = "(al|[Aa]rt|ca|figs?|[Nn]os?|[Nn]rs?|op|p|pp|[Pp]\u00e1g)";
    /** A {@link #NON_BREAKER_DIGITS} prefix (group 1), the break marker, then a digit (group 3). */
    public static Pattern nonBreakerDigits = Pattern.compile("(" + NON_BREAKER_DIGITS + "[\\ ]*[\\.-]*)" + SECTION + "([\\ ]*\\p{Digit})", Pattern.UNICODE_CHARACTER_CLASS);
    /** Uppercase letters separated by period + break marker, ending in a period. */
    public static Pattern acronym = Pattern.compile("(\\p{Lu})(\\.(\u00a7)[\\ ]*\\p{Lu})+([\\.])", Pattern.UNICODE_CHARACTER_CLASS);
    /** Digits and a period (group 1), a break marker, then more digits (group 2). */
    public static Pattern numbers = Pattern.compile("(\\p{Digit}+[\\.])[\\ ]*[\u00a7][\\ ]*(\\p{Digit}+)", Pattern.UNICODE_CHARACTER_CLASS);
    /** A non-whitespace run (group 1) followed by a single final period. */
    public static Pattern wordDot = Pattern.compile("^(\\S+)\\.$");
    /** Word starting with lowercase letters. */
    public static Pattern startLower = Pattern.compile("^\\p{Lower}+", Pattern.UNICODE_CHARACTER_CLASS);
    /** Word starting with a punctuation character; note that {@code ,-\/} is a range (comma to slash). */
    public static Pattern startPunct = Pattern.compile("^[\\!#\\$%&\\(\\)\\*\\+,-\\/:;=>\\?@\\[\\\\\\]\\^\\{\\|\\}~]");
    /** Word starting with digits. */
    public static Pattern startDigit = Pattern.compile("^\\p{Digit}+", Pattern.UNICODE_CHARACTER_CLASS);

    /** Language codes for which a {@code /<code>-nonbreaker.txt} resource is looked up. */
    private static final String[] SUPPORTED_LANGUAGES = {"de", "en", "es", "eu", "fr", "gl", "it", "nl"};
    /** Finds any letter; used to recognise dotted abbreviations such as {@code U.S.A}. */
    private static final Pattern ALPHABETIC = Pattern.compile("\\p{Alpha}", Pattern.UNICODE_CHARACTER_CLASS);
    /** Whole-word form of {@link #NON_BREAKER_DIGITS}, compiled once instead of per word. */
    private static final Pattern NON_BREAKER_DIGITS_WORD = Pattern.compile(NON_BREAKER_DIGITS);

    /** Disjunctive regex built from the language's non-breaker file. */
    private String NON_BREAKER = null;
    /** A non-breaker preceded by a space and followed by the break marker. */
    private Pattern segmenterNonBreaker;
    /** A word that is exactly one of the non-breakers. */
    private Pattern tokenizerNonBreaker;

    /**
     * Creates a breaker for the language named by the {@code language} property.
     *
     * @param properties must contain a {@code language} entry
     * @throws NullPointerException if the {@code language} property is not set
     */
    public NonPeriodBreaker(Properties properties) {
        this.loadNonBreaker(properties);
    }

    /** Reads the {@code language} property and builds the non-breaker regex if not built yet. */
    private void loadNonBreaker(Properties properties) {
        String lang = properties.getProperty("language");
        if (lang == null) {
            // Same exception type the lookup would raise anyway, but with an explanation.
            throw new NullPointerException("the 'language' property is not set");
        }
        if (this.NON_BREAKER == null) {
            this.createNonBreaker(lang);
        }
    }

    /**
     * Loads the non-breaker entries for {@code lang} and compiles the patterns derived from them.
     * Comment lines (starting with {@code #}) and blank lines are ignored. If no resource exists
     * for the language an error is printed and the JVM exits with status 1.
     */
    private void createNonBreaker(String lang) {
        ArrayList<String> nonBreakerList = new ArrayList<String>();
        InputStream nonBreakerInputStream = this.getNonBreakerInputStream(lang);
        if (nonBreakerInputStream == null) {
            System.err.println("ERROR: Not nonbreaker file for language " + lang + " in src/main/resources!!");
            System.exit(1);
        }
        // try-with-resources closes the reader (and the wrapped stream) on every path.
        // UTF-8 is requested explicitly so decoding does not depend on the platform default;
        // NOTE(review): assumes the bundled resource files are UTF-8 encoded.
        try (BufferedReader breader = new BufferedReader(new InputStreamReader(nonBreakerInputStream, StandardCharsets.UTF_8))) {
            String line;
            while ((line = breader.readLine()) != null) {
                line = line.trim();
                // A blank line is not an abbreviation; passing it on would presumably put an
                // empty alternative into the generated regex.
                if (line.isEmpty() || line.startsWith("#")) {
                    continue;
                }
                nonBreakerList.add(line);
            }
        }
        catch (IOException e) {
            // Best effort: entries read before the failure are still used.
            e.printStackTrace();
        }
        this.NON_BREAKER = StringUtils.createDisjunctRegexFromList(nonBreakerList);
        // Compile once here rather than on every call / every word.
        this.segmenterNonBreaker = Pattern.compile("([\\ ](" + this.NON_BREAKER + ")[\\ ]*[\\.]*)[\\ ]*" + SECTION);
        this.tokenizerNonBreaker = Pattern.compile("(" + this.NON_BREAKER + ")");
    }

    /**
     * Opens the non-breaker resource for a language code (compared case-insensitively).
     *
     * @return the resource stream, or {@code null} if the language is not supported or the
     *         resource is missing
     */
    private final InputStream getNonBreakerInputStream(String lang) {
        for (String code : SUPPORTED_LANGUAGES) {
            if (lang.equalsIgnoreCase(code)) {
                return this.getClass().getResourceAsStream("/" + code + "-nonbreaker.txt");
            }
        }
        return null;
    }

    /**
     * Splits text into sentences, one per line. Every {@link #segmentAll} match is first marked
     * as a break; the marker is then removed after digit prefixes, after non-breaker
     * abbreviations, inside acronyms and inside numbers; remaining markers become newlines.
     *
     * @param line the text to segment
     * @return the text with {@code \n} at each sentence break
     */
    public String SegmenterNonBreaker(String line) {
        line = segmentAll.matcher(line).replaceAll("$1\u00a7$2");
        line = nonBreakerDigits.matcher(line).replaceAll("$1$3");
        line = this.segmenterNonBreaker.matcher(line).replaceAll(" $1 ");
        line = NonPeriodBreaker.deSegmentAcronyms(line);
        line = numbers.matcher(line).replaceAll("$1$2");
        line = section.matcher(line).replaceAll("\n");
        return line;
    }

    /**
     * Replaces the break markers inside every {@link #acronym} match with a space.
     *
     * @param line text containing break markers
     * @return the text with acronyms rejoined
     */
    public static String deSegmentAcronyms(String line) {
        Matcher acronymMatcher = acronym.matcher(line);
        StringBuffer sb = new StringBuffer();
        while (acronymMatcher.find()) {
            String rejoined = section.matcher(acronymMatcher.group()).replaceAll(" ");
            // Quote so the matched text is always inserted literally.
            acronymMatcher.appendReplacement(sb, Matcher.quoteReplacement(rejoined));
        }
        acronymMatcher.appendTail(sb);
        return sb.toString();
    }

    /**
     * Separates a word-final period from its word unless the period belongs to the word.
     * The input is trimmed, runs matched by {@code RuleBasedTokenizer.doubleSpaces} are replaced
     * by one space, and the words are re-joined with a space after each one (including the last).
     *
     * @param line the text to tokenize
     * @return the words separated by single spaces, with breaking periods written as {@code " ."}
     */
    public String TokenizerNonBreaker(String line) {
        line = line.trim();
        line = RuleBasedTokenizer.doubleSpaces.matcher(line).replaceAll(" ");
        StringBuilder sb = new StringBuilder();
        String[] words = line.split(" ");
        for (int i = 0; i < words.length; ++i) {
            Matcher wordDotMatcher = wordDot.matcher(words[i]);
            if (wordDotMatcher.find()) {
                String prefix = wordDotMatcher.replaceAll("$1");
                String next = i < words.length - 1 ? words[i + 1] : null;
                if (!this.keepsFinalPeriod(prefix, next)) {
                    words[i] = prefix + " .";
                }
            }
            sb.append(words[i]).append(" ");
        }
        return sb.toString();
    }

    /**
     * Tells whether the period after {@code prefix} stays attached to the word.
     *
     * @param prefix the word without its final period
     * @param next the following word, or {@code null} if {@code prefix} is the last word
     */
    private boolean keepsFinalPeriod(String prefix, String next) {
        // Dotted abbreviation such as "U.S.A": has an inner period and at least one letter.
        // (Previously this required the whole prefix to be letters, which can never hold
        // together with contains("."), so the rule never fired.)
        if (prefix.contains(".") && ALPHABETIC.matcher(prefix).find()) {
            return true;
        }
        // Known abbreviation from the language's non-breaker list.
        if (this.tokenizerNonBreaker.matcher(prefix).matches()) {
            return true;
        }
        if (next == null) {
            return false;
        }
        // Next word starts in lowercase or with punctuation.
        if (startLower.matcher(next).find() || startPunct.matcher(next).find()) {
            return true;
        }
        // Numeric-context abbreviation ("art", "no", "pp", ...) followed by a number.
        return NON_BREAKER_DIGITS_WORD.matcher(prefix).matches() && startDigit.matcher(next).find();
    }
}

