/*
 * Decompiled with CFR 0.152.
 */
package org.nasdanika.rag.core;

import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
import java.util.function.Function;
import org.eclipse.emf.ecore.EObject;
import org.nasdanika.models.pdf.Document;
import org.nasdanika.models.pdf.Paragraph;

public class PdfTextSplitter {
    /** Maximum token count of a chunk; exceeding it while adding text raises an {@link IllegalStateException}. */
    private final int size;

    /** Initial token budget used when pre-filling a new chunk with text that precedes it. */
    private final int overlap;

    /**
     * Slack, in tokens: a chunk counts as full once its size exceeds {@code size - tolerance},
     * and overlap collection stops once the remaining budget drops to this value or below.
     */
    private final int tolerance;

    /** Converts text into a list of tokens; token counts are the sizes of the returned lists. */
    private final Function<String, List<String>> tokenizer;

    /**
     * Creates a splitter.
     *
     * @param size maximum number of tokens in a chunk
     * @param overlap token budget for the overlap carried into each new chunk
     * @param tolerance slack, in tokens, applied to both the "chunk is full" test and the overlap budget
     * @param tokenizer function turning text into tokens, must not be {@code null}
     * @throws NullPointerException if {@code tokenizer} is {@code null}
     */
    public PdfTextSplitter(int size, int overlap, int tolerance, Function<String, List<String>> tokenizer) {
        this.size = size;
        this.overlap = overlap;
        this.tolerance = tolerance;
        // Fail here rather than with an anonymous NPE deep inside split().
        this.tokenizer = Objects.requireNonNull(tokenizer, "tokenizer");
    }

    /**
     * Cuts the text into consecutive sentences using the US-locale sentence {@link BreakIterator}.
     * Each returned element is the exact substring between two adjacent boundaries, so
     * concatenating the elements reproduces the input.
     *
     * @param text text to split
     * @return sentences in document order; empty when the iterator reports no boundary after the first
     */
    protected List<String> splitIntoSentences(String text) {
        BreakIterator boundaries = BreakIterator.getSentenceInstance(Locale.US);
        boundaries.setText(text);
        ArrayList<String> sentences = new ArrayList<String>();
        int from = boundaries.first();
        for (int to = boundaries.next(); to != BreakIterator.DONE; to = boundaries.next()) {
            sentences.add(text.substring(from, to));
            from = to;
        }
        return sentences;
    }

    /**
     * Splits the text into words on runs of whitespace (regex {@code \s+}).
     *
     * <p>{@link String#split(String)} yields an empty first element when the text starts with
     * whitespace, and a single empty element for empty text. Those empty strings are not words
     * and are dropped here.
     *
     * @param text text to split
     * @return non-empty words in order of appearance; empty for empty or whitespace-only text
     */
    protected List<String> splitIntoWords(String text) {
        return Arrays.stream(text.split("\\s+"))
            .filter(word -> !word.isEmpty())
            .toList();
    }

    /**
     * Separator placed between words: a single space. Subclasses may override.
     *
     * @return the word separator
     */
    protected String getWordSeparator() {
        final String singleSpace = " ";
        return singleSpace;
    }

    /**
     * Separator placed between lines: the platform line separator. Subclasses may override.
     *
     * @return the line separator
     */
    protected String getLineSeparator() {
        final String platformSeparator = System.lineSeparator();
        return platformSeparator;
    }

    /**
     * Separator placed between paragraphs: the line separator twice in a row.
     *
     * @return the paragraph separator
     */
    protected String getParagraphSeparator() {
        StringBuilder separator = new StringBuilder();
        separator.append(this.getLineSeparator());
        separator.append(this.getLineSeparator());
        return separator.toString();
    }

    /**
     * Splits the paragraphs of a document into chunks bounded by the configured token size.
     *
     * <p>Paragraphs are visited page by page, article by article. Each paragraph is added whole
     * when it fits. Otherwise it is added sentence by sentence, a sentence that does not fit is
     * added word by word, and a word that does not fit is added token by token. Whenever the
     * current chunk is full, a new chunk is started; it is pre-filled with overlap text taken
     * from the content just before the current position.
     *
     * @param document document to split
     * @return chunks in document order; always contains at least one (possibly empty) chunk
     * @throws IllegalStateException if adding text pushes a chunk over the configured size
     */
    public List<Chunk> split(Document document) {
        // One id sequence shared by paragraph, sentence and word records.
        int[] counter = new int[]{0};
        List<ParagraphRecord> paragraphs = document.getPages().stream()
            .flatMap(page -> page.getArticles().stream())
            .flatMap(article -> article.getParagraphs().stream())
            .map(paragraph -> this.toParagraphRecord((Paragraph)paragraph, counter))
            .toList();

        // Loop-invariant, so tokenize the separator once.
        int paragraphSeparatorSize = this.tokenizer.apply(this.getParagraphSeparator()).size();

        List<Chunk> chunks = new ArrayList<>();
        ChunkImpl chunk = new ChunkImpl(this, paragraphs, -1, -1, -1, -1);
        chunks.add(chunk);
        for (int i = 0; i < paragraphs.size(); ++i) {
            ParagraphRecord paragraph = paragraphs.get(i);
            if (paragraph.size() + chunk.size() + paragraphSeparatorSize < this.size) {
                chunk.add(paragraph);
                continue;
            }
            if (chunk.isFull()) {
                // Start a new chunk whose overlap ends with the previous paragraph.
                chunk = new ChunkImpl(this, paragraphs, i - 1, -1, -1, -1);
                chunks.add(chunk);
                // Adding a paragraph to a non-empty chunk also appends separators, so count one
                // here exactly as the first check does; otherwise the add can exceed the limit.
                int separatorCost = chunk.size() > 0 ? paragraphSeparatorSize : 0;
                if (paragraph.size() + chunk.size() + separatorCost < this.size) {
                    chunk.add(paragraph);
                    continue;
                }
            }
            // The paragraph does not fit as a whole: fall back to sentences.
            for (int j = 0; j < paragraph.sentences().size(); ++j) {
                SentenceRecord sentence = paragraph.sentences().get(j);
                if (sentence.size() + chunk.size() < this.size) {
                    chunk.add(sentence);
                    continue;
                }
                if (chunk.isFull()) {
                    chunk = new ChunkImpl(this, paragraphs, i, j - 1, -1, -1);
                    chunks.add(chunk);
                    if (sentence.size() + chunk.size() < this.size) {
                        chunk.add(sentence);
                        continue;
                    }
                }
                // The sentence does not fit as a whole: fall back to words.
                int wordSeparatorSize = this.tokenizer.apply(this.getWordSeparator()).size();
                for (int k = 0; k < sentence.words().size(); ++k) {
                    WordRecord word = sentence.words().get(k);
                    if (word.tokens().size() + chunk.size() + wordSeparatorSize < this.size) {
                        chunk.add(word);
                        chunk.add(this.getWordSeparator(), (EObject)word.paragraph());
                        continue;
                    }
                    if (chunk.isFull()) {
                        chunk = new ChunkImpl(this, paragraphs, i, j, k - 1, -1);
                        chunks.add(chunk);
                        if (word.tokens().size() + chunk.size() + wordSeparatorSize < this.size) {
                            chunk.add(word);
                            chunk.add(this.getWordSeparator(), (EObject)word.paragraph());
                            continue;
                        }
                    }
                    // The word does not fit as a whole: fall back to single tokens.
                    List<String> tokens = word.tokens();
                    for (int t = 0; t < tokens.size(); ++t) {
                        chunk.add(tokens.get(t), 1, (EObject)word.paragraph());
                        if (chunk.isFull()) {
                            // t is the index of the token just added. It must advance on every
                            // iteration so a later rollover in the same word passes the right index.
                            chunk = new ChunkImpl(this, paragraphs, i, j, k, t);
                            chunks.add(chunk);
                        }
                    }
                }
            }
        }
        return chunks;
    }

    /** Builds the record for one paragraph, including its sentence and word records. */
    private ParagraphRecord toParagraphRecord(Paragraph paragraph, int[] counter) {
        String text = paragraph.getText(this.getLineSeparator(), this.getWordSeparator());
        int id = counter[0]++;
        int tokenCount = this.tokenizer.apply(text).size();
        List<SentenceRecord> sentences = this.splitIntoSentences(text).stream()
            .map(sentence -> this.toSentenceRecord(sentence, paragraph, counter))
            .toList();
        return new ParagraphRecord(id, text, tokenCount, paragraph, sentences);
    }

    /** Builds the record for one sentence of the given paragraph, including its word records. */
    private SentenceRecord toSentenceRecord(String sentence, Paragraph paragraph, int[] counter) {
        int id = counter[0]++;
        int tokenCount = this.tokenizer.apply(sentence).size();
        List<WordRecord> words = this.splitIntoWords(sentence).stream()
            .map(word -> new WordRecord(counter[0]++, word, this.tokenizer.apply(word), paragraph))
            .toList();
        return new SentenceRecord(id, sentence, tokenCount, paragraph, words);
    }

    /*
     * Exception performing whole class analysis.
     */
    private class ChunkImpl
    implements Chunk {
        private StringBuilder textBuilder;
        private int size;
        private List<EObject> sources;
        private Set<Integer> sourceRecords;
        private int chunkOverlap;
        final /* synthetic */ PdfTextSplitter this$0;

        /*
         * Unable to fully structure code
         */
        ChunkImpl(PdfTextSplitter var1_1, List<ParagraphRecord> paragraphs, int paragraph, int sentence, int word, int token) {
            this.this$0 = var1_1;
            super();
            this.textBuilder = new StringBuilder();
            this.sources = new ArrayList<EObject>();
            this.sourceRecords = new HashSet<Integer>();
            remaining = var1_1.overlap;
            chunks = new ArrayList<ChunkImpl>();
            block0: while (paragraph >= 0) {
                block12: {
                    block10: {
                        block11: {
                            p = paragraphs.get(paragraph);
                            if (sentence != -1) break block10;
                            if (p.size() >= remaining) break block11;
                            pChunk = new ChunkImpl(var1_1, null, -1, -1, -1, -1);
                            pChunk.add(p);
                            pChunk.add(var1_1.getParagraphSeparator(), null);
                            chunks.add(pChunk);
                            if ((remaining -= pChunk.size()) <= var1_1.tolerance) {
                                break;
                            }
                            break block12;
                        }
                        sentence = p.sentences().size() - 1;
                    }
                    while (sentence >= 0) {
                        block15: {
                            block13: {
                                block14: {
                                    s = p.sentences().get(sentence);
                                    if (word != -1) break block13;
                                    if (s.size() >= remaining) break block14;
                                    sChunk = new ChunkImpl(var1_1, null, -1, -1, -1, -1);
                                    sChunk.add(s);
                                    chunks.add(sChunk);
                                    if ((remaining -= sChunk.size()) <= var1_1.tolerance) {
                                        break block0;
                                    }
                                    break block15;
                                }
                                word = s.words().size() - 1;
                            }
                            while (word >= 0) {
                                w = s.words().get(word);
                                if (token != -1) ** GOTO lbl50
                                if (w.tokens().size() < remaining) {
                                    wChunk = new ChunkImpl(var1_1, null, -1, -1, -1, -1);
                                    wChunk.add(w);
                                    wChunk.add(var1_1.getWordSeparator(), (EObject)w.paragraph());
                                    chunks.add(wChunk);
                                    if ((remaining -= wChunk.size()) <= var1_1.tolerance) {
                                        break block0;
                                    }
                                } else {
                                    token = w.tokens().size() - 1;
lbl50:
                                    // 3 sources

                                    while (token >= 0) {
                                        tChunk = new ChunkImpl(var1_1, null, -1, -1, -1, -1);
                                        tChunk.add(w.tokens().get(token), (EObject)w.paragraph());
                                        chunks.add(tChunk);
                                        if ((remaining -= tChunk.size()) <= var1_1.tolerance) break block0;
                                        --token;
                                    }
                                }
                                --word;
                            }
                        }
                        --sentence;
                    }
                }
                --paragraph;
            }
            Collections.reverse(chunks);
            for (ChunkImpl oc : chunks) {
                this.add(oc);
            }
            this.chunkOverlap = this.size;
        }

        @Override
        public int size() {
            return this.size;
        }

        @Override
        public String getText() {
            return this.textBuilder.toString();
        }

        void add(String text, int size, EObject source) {
            this.textBuilder.append(text);
            this.size += size;
            if (this.size > this.this$0.size) {
                throw new IllegalStateException("Chunk size exceeded: " + this.size);
            }
            this.sources.add(source);
        }

        void add(String text, EObject source) {
            this.add(text, this.this$0.tokenizer.apply(text).size(), source);
        }

        void add(ParagraphRecord paragraph) {
            if (!this.sourceRecords.add(paragraph.id())) {
                throw new IllegalStateException("Duplicate source paragraph: " + paragraph);
            }
            if (this.size > 0) {
                this.add(this.this$0.getParagraphSeparator(), (EObject)paragraph.paragraph());
            }
            this.add(paragraph.text(), paragraph.size(), (EObject)paragraph.paragraph());
            this.add(this.this$0.getParagraphSeparator(), this.this$0.tokenizer.apply(this.this$0.getParagraphSeparator()).size(), (EObject)paragraph.paragraph());
        }

        void add(SentenceRecord sentence) {
            if (!this.sourceRecords.add(sentence.id())) {
                throw new IllegalStateException("Duplicate source sentence: " + sentence);
            }
            this.add(sentence.text(), sentence.size(), (EObject)sentence.paragraph());
        }

        void add(WordRecord word) {
            if (!this.sourceRecords.add(word.id())) {
                throw new IllegalStateException("Duplicate source word: " + word);
            }
            if (this.size > 0) {
                this.add(this.this$0.getWordSeparator(), null);
            }
            this.add(word.text(), word.tokens().size(), (EObject)word.paragraph());
        }

        boolean isFull() {
            return this.size > this.this$0.size - this.this$0.tolerance;
        }

        void add(ChunkImpl chunk) {
            this.add(chunk.getText(), chunk.size(), null);
            this.sources.addAll(chunk.getSources());
            for (Integer id : chunk.sourceRecords) {
                if (this.sourceRecords.add(id)) continue;
                throw new IllegalStateException("Duplicate source record in chunk: " + chunk);
            }
        }

        @Override
        public List<EObject> getSources() {
            return this.sources.stream().filter(Objects::nonNull).distinct().toList();
        }

        @Override
        public int overlap() {
            return this.chunkOverlap;
        }
    }

    /**
     * A paragraph prepared for splitting.
     *
     * @param id number drawn from the id sequence shared by paragraph, sentence and word records
     * @param text paragraph text
     * @param size token count of {@code text}
     * @param paragraph model element the text was read from
     * @param sentences the paragraph's sentences, in order
     */
    private record ParagraphRecord(
            int id,
            String text,
            int size,
            Paragraph paragraph,
            List<SentenceRecord> sentences) {
    }

    /**
     * A sentence of a paragraph prepared for splitting.
     *
     * @param id number drawn from the id sequence shared by paragraph, sentence and word records
     * @param text sentence text
     * @param size token count of {@code text}
     * @param paragraph model element of the paragraph containing the sentence
     * @param words the sentence's words, in order
     */
    private record SentenceRecord(
            int id,
            String text,
            int size,
            Paragraph paragraph,
            List<WordRecord> words) {
    }

    /**
     * A word of a sentence prepared for splitting.
     *
     * @param id number drawn from the id sequence shared by paragraph, sentence and word records
     * @param text word text
     * @param tokens tokens of {@code text}; their count is the word's size
     * @param paragraph model element of the paragraph containing the word
     */
    private record WordRecord(
            int id,
            String text,
            List<String> tokens,
            Paragraph paragraph) {
    }

    public static interface Chunk {
        public String getText();

        public List<EObject> getSources();

        public int size();

        public int overlap();
    }
}

