/*
 * Decompiled with CFR 0.152.
 */
package com.s24.search.solr.analyzers;

import com.google.common.base.CharMatcher;
import com.google.common.base.Splitter;
import com.google.common.io.CharStreams;
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeFactory;

public class AnalyzingSentenceTokenizer
extends Tokenizer {
    private static final Pattern SENTENCE_PATTERN = Pattern.compile("(?<=[.?!\\|;-])\\s+(?=\\p{Lu})");
    private static final Splitter SPACE_SPLITTER = Splitter.on((CharMatcher)CharMatcher.WHITESPACE).trimResults();
    private static final CharMatcher SENTENCE_NOISE = CharMatcher.DIGIT.or(CharMatcher.anyOf((CharSequence)",;.:$!?%&/<>\u2122\u00ae\\-\u2013'\"|"));
    private static final Pattern COMMA_PATTERN = Pattern.compile("(,+(?=\\D))|((?<=\\D),+)|;");
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
    private final CharTermAttribute termAtt = (CharTermAttribute)this.addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = (OffsetAttribute)this.addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute positionIncrement = (PositionIncrementAttribute)this.addAttribute(PositionIncrementAttribute.class);
    private final StringBuilder inputBuffer = new StringBuilder();
    private final Matcher sentenceMatcher;
    private int index;
    private boolean lastSentenceFromCommaSplit = false;
    private final boolean removeBadSentences;
    private final CharArraySet stopWords;
    private final float commaWordThreshold;
    private final float maxStopwordRatio;
    private final int minSentenceLength;

    public AnalyzingSentenceTokenizer(AttributeFactory factory, boolean removeBadSentences, CharArraySet stopWords, float commaWordThreshold, float maxStopwordRatio, int minSentenceLength) {
        super(factory);
        this.removeBadSentences = removeBadSentences;
        this.stopWords = stopWords;
        this.commaWordThreshold = commaWordThreshold;
        this.maxStopwordRatio = maxStopwordRatio;
        this.minSentenceLength = minSentenceLength;
        this.sentenceMatcher = SENTENCE_PATTERN.matcher("");
    }

    public void end() throws IOException {
        super.end();
        int ofs = this.correctOffset(this.inputBuffer.length());
        this.offsetAtt.setOffset(ofs, ofs);
    }

    public void reset() throws IOException {
        super.reset();
        this.inputBuffer.setLength(0);
        this.inputBuffer.append(CharStreams.toString((Readable)this.input));
        this.sentenceMatcher.reset(this.inputBuffer);
        this.index = 0;
    }

    public final boolean incrementToken() throws IOException {
        while (this.index < this.inputBuffer.length()) {
            if (!this.incrementTokenInternal()) continue;
            return true;
        }
        return false;
    }

    protected boolean incrementTokenInternal() throws IOException {
        boolean emit;
        String sentence = null;
        sentence = this.sentenceMatcher.find(this.index) ? this.inputBuffer.substring(this.index, this.sentenceMatcher.end()) : this.inputBuffer.substring(this.index, this.inputBuffer.length());
        Matcher commaMatcher = COMMA_PATTERN.matcher(sentence);
        if (commaMatcher.find()) {
            int commaCount = 1;
            while (commaMatcher.find()) {
                ++commaCount;
            }
            float commaToWordRatio = (float)commaCount / (float)(CharMatcher.WHITESPACE.countIn((CharSequence)sentence) - 1);
            if (commaToWordRatio > this.commaWordThreshold || this.lastSentenceFromCommaSplit) {
                commaMatcher.reset();
                commaMatcher.find();
                sentence = sentence.substring(0, commaMatcher.end());
                this.lastSentenceFromCommaSplit = true;
            }
        } else {
            this.lastSentenceFromCommaSplit = false;
        }
        boolean isOnlySentence = sentence.length() == this.inputBuffer.length();
        boolean bl = emit = this.isQualitySentence(sentence) || isOnlySentence || !this.removeBadSentences;
        if (emit) {
            this.emitSentence(sentence);
        }
        this.index += sentence.length();
        return emit;
    }

    private boolean isQualitySentence(CharSequence sentence) {
        SentenceStatistics sentenceStatistics = this.analyzeSentence(sentence);
        boolean highInformationGain = sentenceStatistics.getStopwordsRatio() <= this.maxStopwordRatio;
        boolean shortSentence = sentenceStatistics.getWordCount() < this.minSentenceLength;
        return highInformationGain || shortSentence;
    }

    private void emitSentence(CharSequence sentence) {
        this.termAtt.setEmpty().append(sentence);
        this.offsetAtt.setOffset(this.correctOffset(this.index), this.correctOffset(this.index + sentence.length()));
        this.positionIncrement.setPositionIncrement(1);
    }

    private SentenceStatistics analyzeSentence(CharSequence sentence) {
        String cleanSentence = WHITESPACE_PATTERN.matcher(SENTENCE_NOISE.removeFrom((CharSequence)CharMatcher.WHITESPACE.trimFrom(sentence))).replaceAll(" ").toLowerCase(Locale.GERMAN);
        Iterable words = SPACE_SPLITTER.split((CharSequence)cleanSentence);
        int stopWordCount = 0;
        int wordCount = 0;
        for (String w : words) {
            if (this.stopWords.contains((CharSequence)w)) {
                ++stopWordCount;
            }
            ++wordCount;
        }
        return new SentenceStatistics(wordCount, stopWordCount);
    }

    private static class SentenceStatistics {
        private final int wordCount;
        private final int stopwordCount;

        public SentenceStatistics(int wordCount, int stopwordCount) {
            this.wordCount = wordCount;
            this.stopwordCount = stopwordCount;
        }

        public int getWordCount() {
            return this.wordCount;
        }

        public float getStopwordsRatio() {
            return this.wordCount > 0 ? (float)this.stopwordCount / (float)this.wordCount : 0.0f;
        }
    }
}

