/*
 * Decompiled with CFR 0.152.
 */
package eus.ixa.ixa.pipe.seg;

import eus.ixa.ixa.pipe.seg.SentenceSegmenter;
import eus.ixa.ixa.pipe.tok.NonPeriodBreaker;
import eus.ixa.ixa.pipe.tok.RuleBasedTokenizer;
import java.util.Properties;
import java.util.regex.Pattern;

/**
 * Rule-based sentence segmenter.
 *
 * <p>The input text is first normalized by {@link #buildText(String)}: every pair of
 * {@code <JAR>} line-break markers becomes a paragraph marker (two pilcrow signs, U+00B6)
 * and every remaining single {@code <JAR>} marker becomes a space. A fixed sequence of
 * regular-expression substitutions then inserts {@code '\n'} at the detected sentence
 * boundaries, the result is passed through {@link NonPeriodBreaker}, and the text is
 * finally split on {@code '\n'}.
 *
 * <p>NOTE(review): the public static fields below are intentionally left non-final so that
 * any external code that assigns to them keeps working; treat them as constants.
 */
public class RuleBasedSegmenter
implements SentenceSegmenter {
    /** Marker that stands for a single line break in the incoming text. */
    public static final String LINE_BREAK = "<JAR>";
    /** Marker (two pilcrow signs) that stands for a paragraph break in the built text. */
    public static final String PARAGRAPH = "\u00b6\u00b6";
    /** Matches a single line-break marker. */
    public static Pattern lineBreak = Pattern.compile(LINE_BREAK);
    /** Matches two consecutive line-break markers. */
    public static Pattern doubleLineBreak = Pattern.compile("(<JAR><JAR>)");
    /** Matches one paragraph marker. */
    public static Pattern paragraph = Pattern.compile("(\u00b6\u00b6)");
    /** Character class of punctuation that may precede the first letter of a sentence. */
    public static String INITIAL_PUNCT = "[#'\"\\\u00bf\\\u00a1\u00ab<\u0091\u0093\u201b\u201c\u201f\u2018\u2039]";
    /** Character class of punctuation that may follow the closing mark of a sentence. */
    public static String FINAL_PUNCT = "['\"\\)\\]\\%\u00bb=\u0092\u0094\u201d\u203a\u2019]";
    /** End punctuation, paragraph marker(s), then the start of a link (http, www or ftp). */
    public static Pattern endPunctLinkPara = Pattern.compile("([?!\\.])[\\ ]*(\u00b6\u00b6)+[\\ ]*(http|www|ftp)");
    /** End punctuation, paragraph marker(s), then optional initial punctuation and an uppercase letter. */
    public static Pattern conventionalPara = Pattern.compile("([?!\\.])[\\ ]*(\u00b6\u00b6)+[\\ ]*(" + INITIAL_PUNCT + "*[\\p{Lu}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** End punctuation followed by final punctuation, paragraph marker(s), then an uppercase starter. */
    public static Pattern endInsideQuotesPara = Pattern.compile("([?!\\.](\u00b6)*" + FINAL_PUNCT + "+)(\u00b6\u00b6)+(" + INITIAL_PUNCT + "*(\u00b6\u00b6)*[\\p{Lu}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** Two or more dots, paragraph marker(s), then an uppercase starter. */
    public static Pattern multiDotsParaStarters = Pattern.compile("(\\.[\\.]+)(\u00b6\u00b6)+(" + INITIAL_PUNCT + "*[\\p{Lu}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** Paragraph marker(s) followed by a lowercase letter or one of the listed symbols. */
    public static Pattern spuriousParagraph = Pattern.compile("(\u00b6\u00b6)+\\s*([\\p{Lower}\\!#\\$%&\\(\\)\\*\\+,-\\/:;=>\\?@\\[\\\\\\]\\^\\{\\|\\}~])", Pattern.UNICODE_CHARACTER_CLASS);
    /** Alphanumeric character, paragraph marker(s), then a lowercase letter or a digit. */
    public static Pattern alphaNumParaLowerNum = Pattern.compile("(\\p{Alnum})\\s*(\u00b6\u00b6)+\\s*([\\p{Lower}\\p{Digit}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** Question or exclamation mark, space(s), then an uppercase starter. */
    public static Pattern noPeriodSpaceEnd = Pattern.compile("([?!])[\\ ]+(" + INITIAL_PUNCT + "*[\\p{Lu}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** Two or more dots, space(s), then an uppercase starter. */
    public static Pattern multiDotsSpaceStarters = Pattern.compile("(\\.[\\.]+)[\\ ]+(" + INITIAL_PUNCT + "*[\\p{Lu}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** End punctuation followed by final punctuation, space(s), then an uppercase starter. */
    public static Pattern endInsideQuotesSpace = Pattern.compile("([?!\\.][\\ ]*" + FINAL_PUNCT + "+)[\\ ]+(" + INITIAL_PUNCT + "*[\\ ]*[\\p{Lu}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** End punctuation, space(s), then at least one initial punctuation mark and an uppercase letter. */
    public static Pattern punctSpaceUpper = Pattern.compile("([?!\\.])[\\ ]+(" + INITIAL_PUNCT + "+[\\ ]*[\\p{Lu}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** End punctuation, space(s), then hyphen(s), optional opening parentheses and an uppercase letter. */
    public static Pattern punctSpaceMultiPunct = Pattern.compile("([?!\\.])[\\ ]+([\\-]+[\\ ]*[\\(]*\\p{Lu})", Pattern.UNICODE_CHARACTER_CLASS);
    /** End punctuation, optional space(s), then the start of a link (http, www or ftp). */
    public static Pattern endPunctLinkSpace = Pattern.compile("([?!\\.])[\\ ]*(http|www|ftp)");
    /** When true, the built text is printed to standard error before segmenting. */
    private static final boolean DEBUG = false;
    private final NonPeriodBreaker nonBreaker;
    private final String text;
    /** True when the "hardParagraph" property equals "yes" (case-insensitive). */
    private final boolean isHardParagraph;

    /**
     * Creates a segmenter for the given text.
     *
     * @param originalText the text to segment, with line breaks encoded as {@code <JAR>}
     * @param properties configuration; the {@code hardParagraph} key is read here and the
     *     whole object is handed to {@link NonPeriodBreaker}
     */
    public RuleBasedSegmenter(String originalText, Properties properties) {
        // getProperty returns null for a missing key; comparing from the constant side
        // treats "missing" as "not hard paragraph" instead of throwing a NullPointerException.
        this.isHardParagraph = "yes".equalsIgnoreCase(properties.getProperty("hardParagraph"));
        this.nonBreaker = new NonPeriodBreaker(properties);
        this.text = RuleBasedSegmenter.buildText(originalText);
    }

    /**
     * Segments the text supplied at construction time.
     *
     * @return the segments obtained by splitting the processed text on {@code '\n'}
     */
    @Override
    public String[] segmentSentence() {
        if (DEBUG) {
            System.err.println("->Build:" + this.text);
        }
        return this.segment(this.text);
    }

    /**
     * Applies the segmentation rules to already-built text.
     *
     * @param builtText text as produced by {@link #buildText(String)}
     * @return the text split on the inserted {@code '\n'} boundaries
     */
    private String[] segment(String builtText) {
        String line = builtText.trim();
        // presumably collapses runs of spaces into one; pattern is defined in RuleBasedTokenizer
        line = RuleBasedTokenizer.doubleSpaces.matcher(line).replaceAll(" ");
        if (this.isHardParagraph) {
            // Hard paragraphs: every paragraph marker starts a new segment.
            line = paragraph.matcher(line).replaceAll("\n$1");
        } else {
            // Break only at paragraph markers that follow sentence-final punctuation...
            line = endPunctLinkPara.matcher(line).replaceAll("$1\n$2$3");
            line = conventionalPara.matcher(line).replaceAll("$1\n$2$3");
            line = endInsideQuotesPara.matcher(line).replaceAll("$1\n$3$4");
            line = multiDotsParaStarters.matcher(line).replaceAll("$1\n$2$3");
            // ...and replace the remaining markers matched by these two rules with a space.
            line = alphaNumParaLowerNum.matcher(line).replaceAll("$1 $3");
            line = spuriousParagraph.matcher(line).replaceAll(" $2");
        }
        // Space-separated boundaries; the order of these substitutions is kept as-is.
        line = noPeriodSpaceEnd.matcher(line).replaceAll("$1\n$2");
        line = multiDotsSpaceStarters.matcher(line).replaceAll("$1\n$2");
        line = endInsideQuotesSpace.matcher(line).replaceAll("$1\n$2");
        line = punctSpaceUpper.matcher(line).replaceAll("$1\n$2");
        line = endPunctLinkSpace.matcher(line).replaceAll("$1\n$2");
        line = punctSpaceMultiPunct.matcher(line).replaceAll("$1\n$2");
        // NOTE(review): presumably handles period-based breaks and non-breaking
        // abbreviations; confirm against NonPeriodBreaker.
        line = this.nonBreaker.SegmenterNonBreaker(line);
        return line.split("\n");
    }

    /**
     * Normalizes the line-break markers of the raw text.
     *
     * @param text raw text with line breaks encoded as {@code <JAR>}
     * @return the text with each {@code <JAR><JAR>} pair replaced by {@link #PARAGRAPH}
     *     and each remaining {@code <JAR>} replaced by a single space
     */
    public static String buildText(String text) {
        // Pairs must be rewritten first, otherwise the single-marker rule would consume them.
        text = doubleLineBreak.matcher(text).replaceAll(PARAGRAPH);
        text = lineBreak.matcher(text).replaceAll(" ");
        return text;
    }
}

