/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.dev.dumpcheck;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.regex.Pattern;
import org.languagetool.Language;
import org.languagetool.dev.dumpcheck.Sentence;
import org.languagetool.dev.dumpcheck.SentenceSource;
import org.tukaani.xz.XZInputStream;

/**
 * A {@link SentenceSource} that reads an XZ-compressed text stream line by line and
 * hands out those lines that look like complete sentences.
 *
 * <p>A line is accepted when, after trimming, it is non-empty, does not start with a
 * lowercase character, is between {@link #MIN_LENGTH} and {@link #MAX_LENGTH}
 * characters long (inclusive) and ends with {@code .}, {@code !}, {@code ?} or
 * {@code :}. Every rejected line is tallied in one of the statistics counters.
 *
 * <p>The input is decoded as UTF-8 (NOTE(review): assumes the dump is UTF-8 encoded;
 * confirm against the data actually fed to this class).
 */
class CommonCrawlSentenceSource extends SentenceSource {
    /** Shortest accepted line length, in characters, after trimming. */
    private static final int MIN_LENGTH = 15;
    /** Longest accepted line length, in characters, after trimming. */
    private static final int MAX_LENGTH = 250;

    /** Accepted sentences that have been read but not yet returned by {@link #next()}. */
    private final List<CommonCrawlSentence> sentences = new ArrayList<CommonCrawlSentence>();
    /** Line-oriented, charset-aware view of the decompressed input. */
    private final BufferedReader reader;

    // Statistics, reported by printStats().
    private int tooShort = 0;
    private int tooLong = 0;
    private int empty = 0;
    private int wrongStartChar = 0;
    private int wrongEndChar = 0;
    /** Number of accepted sentences so far; also used as the running id of each sentence. */
    private int count = 0;
    private int lineCount = 0;

    /**
     * @param input XZ-compressed stream to read lines from
     * @param language passed through to {@link SentenceSource}
     * @param filter passed through to {@link SentenceSource}
     * @throws IOException if the XZ stream cannot be opened
     */
    CommonCrawlSentenceSource(InputStream input, Language language, Pattern filter) throws IOException {
        super(language, filter);
        // Decoding through a Reader (instead of turning each raw byte chunk into a String)
        // keeps multi-byte characters and lines intact across read boundaries.
        this.reader = new BufferedReader(
                new InputStreamReader(new XZInputStream(input), StandardCharsets.UTF_8));
    }

    /**
     * @return {@code true} if at least one more accepted sentence is available
     * @throws RuntimeException wrapping an {@link IOException} if reading the input fails
     */
    @Override
    public boolean hasNext() {
        this.fillSentences();
        return !this.sentences.isEmpty();
    }

    /**
     * @return the next accepted sentence
     * @throws NoSuchElementException if the input is exhausted
     * @throws RuntimeException wrapping an {@link IOException} if reading the input fails
     */
    @Override
    public Sentence next() {
        this.fillSentences();
        if (this.sentences.isEmpty()) {
            throw new NoSuchElementException();
        }
        CommonCrawlSentence ccSentence = this.sentences.remove(0);
        return new Sentence(ccSentence.sentence, this.getSource(), null, null, ccSentence.articleCount);
    }

    @Override
    public String getSource() {
        return "commoncrawl";
    }

    /**
     * Reads lines until one is accepted or the input is exhausted. Does nothing if a
     * sentence is already queued.
     *
     * <p>Note that {@link BufferedReader#readLine()} treats {@code \n}, {@code \r} and
     * {@code \r\n} as line terminators.
     */
    private void fillSentences() {
        try {
            String rawLine;
            while (this.sentences.isEmpty() && (rawLine = this.reader.readLine()) != null) {
                this.processLine(rawLine);
            }
        } catch (IOException e) {
            this.printStats();
            throw new RuntimeException(e);
        }
    }

    /** Applies the acceptance filters to one line, queueing it or bumping the matching counter. */
    private void processLine(String rawLine) {
        ++this.lineCount;
        String line = rawLine.trim();
        if (line.isEmpty()) {
            ++this.empty;
        } else if (Character.isLowerCase(line.charAt(0))) {
            // A lowercase start suggests a sentence fragment.
            ++this.wrongStartChar;
        } else if (line.length() < MIN_LENGTH) {
            ++this.tooShort;
        } else if (line.length() > MAX_LENGTH) {
            ++this.tooLong;
        } else if (endsLikeSentence(line)) {
            this.sentences.add(new CommonCrawlSentence(line, this.count++));
        } else {
            ++this.wrongEndChar;
        }
    }

    /** Returns whether the line ends with one of the accepted terminators: {@code . ! ? :}. */
    private static boolean endsLikeSentence(String line) {
        return line.endsWith(".") || line.endsWith("!") || line.endsWith("?") || line.endsWith(":");
    }

    /** Prints the line and rejection counters collected so far to standard output. */
    private void printStats() {
        System.out.println("lines            : " + this.lineCount);
        System.out.println("indexed sentences: " + this.count);
        System.out.println("tooShort         : " + this.tooShort);
        System.out.println("tooLong          : " + this.tooLong);
        System.out.println("empty            : " + this.empty);
        System.out.println("wrongStartChar   : " + this.wrongStartChar);
        System.out.println("wrongEndChar     : " + this.wrongEndChar);
    }

    /** An accepted line together with the running number it was assigned when accepted. */
    private static class CommonCrawlSentence {
        final String sentence;
        final int articleCount;

        CommonCrawlSentence(String sentence, int articleCount) {
            this.sentence = sentence;
            this.articleCount = articleCount;
        }
    }
}

