/*
 * Decompiled with CFR 0.152.
 */
package eus.ixa.ixa.pipe.tok;

import eus.ixa.ixa.pipe.seg.RuleBasedSegmenter;
import eus.ixa.ixa.pipe.tok.RuleBasedTokenizer;
import eus.ixa.ixa.pipe.tok.StringUtils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Locale;
import java.util.Objects;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Decides, word by word, whether a trailing period should be treated as a
 * break (sentence boundary for the segmenter, separate token for the
 * tokenizer) or left attached to the word.
 *
 * <p>The decision relies on a per-language list of "non-breaker" expressions
 * loaded at construction time, plus the static patterns declared below.
 *
 * <p>The public static fields are intentionally left non-final so that existing
 * code that reads (or reassigns) them keeps compiling; the methods of this class
 * read them at call time.
 */
public class NonPeriodBreaker {
    /** Word ending in one or more periods: (word chars)(final punctuation)(periods) at end of input. */
    public static Pattern nonSegmentedWords = Pattern.compile("([\\p{Alnum}\\.\\-]*)(" + RuleBasedSegmenter.FINAL_PUNCT + "*)(\\.+)$", Pattern.UNICODE_CHARACTER_CLASS);
    /** Optional spaces and initial punctuation followed by an uppercase letter or a digit. */
    public static Pattern nextCandidateWord = Pattern.compile("([\\ ]*" + RuleBasedSegmenter.INITIAL_PUNCT + "*[\\ ]*[\\p{Lu}\\p{Digit}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** Abbreviations that do not break when the following word starts with a digit. */
    public static String NON_BREAKER_DIGITS = "(al|[Aa]rt|ca|figs?|[Nn]os?|[Nn]rs?|op|p|pp|[Pp]\u00e1g)";
    /** A period, then uppercase letters or hyphens, then one or more periods at end of input. */
    public static Pattern acronym = Pattern.compile("(\\.)[\\p{Lu}\\-]+([\\.]+)$", Pattern.UNICODE_CHARACTER_CLASS);
    /** Digits followed by a period, optional spaces, and more digits. */
    public static Pattern numbers = Pattern.compile("(\\p{Digit}+[\\.])[\\ ]*(\\p{Digit}+)", Pattern.UNICODE_CHARACTER_CLASS);
    /** A whole word (non-whitespace run) that ends with a single final period. */
    public static Pattern wordDot = Pattern.compile("^(\\S+)\\.$");
    /** Any alphabetic character. */
    public static Pattern alphabetic = Pattern.compile("\\p{Alpha}", Pattern.UNICODE_CHARACTER_CLASS);
    /** Input starting with one or more lowercase letters. */
    public static Pattern startLower = Pattern.compile("^\\p{Lower}+", Pattern.UNICODE_CHARACTER_CLASS);
    /** Input starting with a punctuation character (note that {@code ,-\/} is a range: {@code , - . /}). */
    public static Pattern startPunct = Pattern.compile("^[\\!#\\$%&\\(\\)\\*\\+,-\\/:;=>\\?@\\[\\\\\\]\\^\\{\\|\\}~]");
    /** Input starting with one or more digits. */
    public static Pattern startDigit = Pattern.compile("^\\p{Digit}+", Pattern.UNICODE_CHARACTER_CLASS);

    /** Languages for which a non-breaker file is looked up on the classpath. */
    private static final String[] BUNDLED_LANGUAGES = {"de", "en", "es", "eu", "fr", "gl", "it", "nl"};

    /** Disjunctive regex built from the non-breaker file; set once during construction. */
    private String NON_BREAKER = null;
    /** {@code "(" + NON_BREAKER + ")"} compiled once instead of once per inspected word. */
    private Pattern nonBreakerPattern = null;

    /**
     * Creates a breaker for the language given in {@code properties}.
     *
     * @param properties must contain {@code language}; may contain
     *     {@code resourcesDirectory} to load the non-breaker file from disk
     *     instead of the classpath
     * @throws NullPointerException if the {@code language} property is missing
     * @throws java.util.regex.PatternSyntaxException if the loaded non-breaker
     *     expressions do not form a valid regular expression
     */
    public NonPeriodBreaker(Properties properties) {
        this.loadNonBreaker(properties);
    }

    /** Reads the configuration properties and loads the non-breaker list if not loaded yet. */
    private void loadNonBreaker(Properties properties) {
        if (this.NON_BREAKER != null) {
            return;
        }
        String lang = Objects.requireNonNull(properties.getProperty("language"), "the 'language' property is not set");
        String resourcesDirectory = properties.getProperty("resourcesDirectory");
        this.createNonBreaker(lang, resourcesDirectory);
    }

    /**
     * Loads the non-breaker file for {@code lang} and builds the non-breaker regex.
     *
     * <p>Lines are trimmed and lines starting with {@code #} are skipped. If no file
     * is found an error is printed and the JVM is terminated with status 1. A read
     * error is reported on stderr and whatever was read so far is used.
     */
    private void createNonBreaker(String lang, String resourcesDirectory) {
        InputStream nonBreakerInputStream = resourcesDirectory == null
                ? this.getNonBreakerInputStream(lang)
                : this.getNonBreakerInputStreamFromDirectory(lang, resourcesDirectory);
        if (nonBreakerInputStream == null) {
            String resourcesLocation = resourcesDirectory == null ? "src/main/resources" : resourcesDirectory;
            System.err.println("ERROR: Not nonbreaker file for language " + lang + " in " + resourcesLocation + "!!");
            System.exit(1);
        }
        // Kept as ArrayList: the parameter type of createDisjunctRegexFromList is not visible here.
        ArrayList<String> nonBreakerList = new ArrayList<>();
        // NOTE(review): the reader uses the platform default charset; the non-breaker
        // files presumably contain non-ASCII text -- confirm their encoding and pin it.
        // try-with-resources closes the reader and the underlying stream.
        try (BufferedReader breader = new BufferedReader(new InputStreamReader(nonBreakerInputStream))) {
            String line;
            while ((line = breader.readLine()) != null) {
                line = line.trim();
                if (!line.startsWith("#")) {
                    nonBreakerList.add(line);
                }
            }
        }
        catch (IOException e) {
            // Best effort: keep the entries read before the failure.
            e.printStackTrace();
        }
        this.NON_BREAKER = StringUtils.createDisjunctRegexFromList(nonBreakerList);
        this.nonBreakerPattern = Pattern.compile("(" + this.NON_BREAKER + ")");
    }

    /**
     * Opens {@code <lang>-nonbreaker.txt} (language code lower-cased) inside
     * {@code resourcesDirectory}.
     *
     * @return the opened stream, or {@code null} if the file cannot be opened
     */
    private InputStream getNonBreakerInputStreamFromDirectory(String lang, String resourcesDirectory) {
        // Locale.ROOT keeps the file name stable under locales with special casing rules (e.g. Turkish "I").
        String fileName = lang.toLowerCase(Locale.ROOT) + "-nonbreaker.txt";
        try {
            return new FileInputStream(new File(resourcesDirectory, fileName));
        }
        catch (FileNotFoundException ex) {
            return null;
        }
    }

    /**
     * Opens the classpath resource {@code /<lang>-nonbreaker.txt} for one of the
     * bundled languages (compared ignoring case).
     *
     * @return the resource stream, or {@code null} if the language is not bundled
     *     or the resource is missing
     */
    private InputStream getNonBreakerInputStream(String lang) {
        for (String bundled : BUNDLED_LANGUAGES) {
            if (lang.equalsIgnoreCase(bundled)) {
                return this.getClass().getResourceAsStream("/" + bundled + "-nonbreaker.txt");
            }
        }
        return null;
    }

    /**
     * Marks sentence boundaries by appending a newline to every word (except the
     * last one) whose final period is judged to end a sentence.
     *
     * @param line the text to segment; it is trimmed and
     *     {@code RuleBasedTokenizer.doubleSpaces} matches are replaced by one space
     * @return the words joined by single spaces, with {@code "\n"} appended to each
     *     word that ends a sentence
     */
    public String SegmenterNonBreaker(String line) {
        line = line.trim();
        line = RuleBasedTokenizer.doubleSpaces.matcher(line).replaceAll(" ");
        String[] words = line.split(" ");
        Pattern nonBreakerDigits = Pattern.compile(NON_BREAKER_DIGITS);
        StringBuilder sb = new StringBuilder(line.length() + 16);
        int last = words.length - 1;
        // The last word is never inspected: there is no following word to look at.
        for (int i = 0; i < last; ++i) {
            sb.append(words[i]);
            if (this.endsSentence(words[i], words[i + 1], nonBreakerDigits)) {
                sb.append('\n');
            }
            sb.append(' ');
        }
        if (last >= 0) {
            sb.append(words[last]);
        }
        return sb.toString();
    }

    /** Tells whether the final period(s) of {@code word} end a sentence, given the word that follows. */
    private boolean endsSentence(String word, String nextWord, Pattern nonBreakerDigits) {
        Matcher nonSegmentedWordMatcher = nonSegmentedWords.matcher(word);
        if (!nonSegmentedWordMatcher.find()) {
            return false;
        }
        // replaceAll keeps any unmatched prefix of the word, so both values below
        // are the word with its matched tail reduced to group 1 / group 2.
        String curWord = nonSegmentedWordMatcher.replaceAll("$1");
        String finalPunct = nonSegmentedWordMatcher.replaceAll("$2");
        boolean plainWord = !curWord.isEmpty() && finalPunct.isEmpty();
        if (plainWord && this.nonBreakerPattern.matcher(curWord).matches()) {
            return false;
        }
        if (acronym.matcher(word).find()) {
            return false;
        }
        if (!nextCandidateWord.matcher(nextWord).find()) {
            return false;
        }
        // Abbreviations such as those in NON_BREAKER_DIGITS do not break before a number.
        return !(plainWord && nonBreakerDigits.matcher(curWord).matches() && startDigit.matcher(nextWord).find());
    }

    /**
     * Detaches the final period of every word where it is judged to be a token of
     * its own, rewriting {@code "word."} as {@code "word ."}.
     *
     * @param line the text to tokenize; it is trimmed and
     *     {@code RuleBasedTokenizer.doubleSpaces} matches are replaced by one space
     * @return the (possibly rewritten) words, each followed by a single space
     */
    public String TokenizerNonBreaker(String line) {
        line = line.trim();
        line = RuleBasedTokenizer.doubleSpaces.matcher(line).replaceAll(" ");
        String[] words = line.split(" ");
        Pattern nonBreakerDigits = Pattern.compile(NON_BREAKER_DIGITS);
        StringBuilder sb = new StringBuilder(line.length() + 16);
        for (int i = 0; i < words.length; ++i) {
            String nextWord = i < words.length - 1 ? words[i + 1] : null;
            sb.append(this.detachFinalPeriod(words[i], nextWord, nonBreakerDigits)).append(' ');
        }
        return sb.toString();
    }

    /**
     * Returns {@code word} with its final period split off as {@code " ."}, or
     * {@code word} unchanged when the period must stay attached.
     *
     * @param nextWord the following word, or {@code null} for the last word
     */
    private String detachFinalPeriod(String word, String nextWord, Pattern nonBreakerDigits) {
        Matcher wordDotMatcher = wordDot.matcher(word);
        if (!wordDotMatcher.find()) {
            return word;
        }
        String curWord = wordDotMatcher.replaceAll("$1");
        // Inner periods together with letters: the final period stays attached.
        if (curWord.contains(".") && alphabetic.matcher(curWord).find()) {
            return word;
        }
        if (this.nonBreakerPattern.matcher(curWord).matches()) {
            return word;
        }
        if (nextWord != null) {
            if (startLower.matcher(nextWord).find() || startPunct.matcher(nextWord).find()) {
                return word;
            }
            if (nonBreakerDigits.matcher(curWord).matches() && startDigit.matcher(nextWord).find()) {
                return word;
            }
        }
        return curWord + " .";
    }
}

