/*
 * Decompiled with CFR 0.152.
 */
package top.aoyudi.rag.impl;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import top.aoyudi.rag.TextSplitter;
import top.aoyudi.rag.properties.Document;

/**
 * Splits text into chunks of at most {@code chunkSize} characters by trying a
 * prioritized list of literal separators, from coarsest to finest.
 *
 * <p>The text is cut on the first separator in the list that it contains. Any
 * resulting piece that is still longer than {@code chunkSize} is split again
 * using only the separators that come after the one just used. When no
 * separator applies, the text is cut into fixed-size character windows.
 * Adjacent pieces are then merged back together (re-joined with the separator)
 * as long as the merged chunk fits within {@code chunkSize}; consecutive
 * chunks may share up to {@code chunkOverlap} characters of trailing context.
 *
 * <p>Empty strings in the separator list are never used for splitting; the
 * fixed-size window fallback is applied whenever no non-empty separator
 * matches.
 */
public class RecursiveCharacterTextSplitter implements TextSplitter {
    public static final int DEFAULT_CHUNK_SIZE = 1000;
    public static final int DEFAULT_CHUNK_OVERLAP = 200;
    public static final List<String> DEFAULT_SEPARATORS = List.of("\n\n", "\n", ". ", ", ", " ", "");
    private final int chunkSize;
    private final int chunkOverlap;
    private final List<String> separators;

    /**
     * Creates a splitter using {@link #DEFAULT_CHUNK_SIZE},
     * {@link #DEFAULT_CHUNK_OVERLAP} and {@link #DEFAULT_SEPARATORS}.
     */
    public RecursiveCharacterTextSplitter() {
        this(DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP, DEFAULT_SEPARATORS);
    }

    /**
     * Creates a splitter with explicit limits and separators.
     *
     * @param chunkSize    maximum length, in characters, of a produced chunk; must be positive
     * @param chunkOverlap maximum number of characters shared by consecutive chunks;
     *                     must be non-negative and strictly less than {@code chunkSize}
     * @param separators   literal separators ordered from coarsest to finest; copied defensively
     * @throws IllegalArgumentException if {@code chunkSize} is not positive, {@code chunkOverlap}
     *                                  is negative, or {@code chunkOverlap >= chunkSize}
     * @throws NullPointerException     if {@code separators} is {@code null}
     */
    public RecursiveCharacterTextSplitter(int chunkSize, int chunkOverlap, List<String> separators) {
        if (chunkSize <= 0) {
            throw new IllegalArgumentException("Chunk size must be positive");
        }
        if (chunkOverlap < 0) {
            throw new IllegalArgumentException("Chunk overlap must not be negative");
        }
        if (chunkOverlap >= chunkSize) {
            throw new IllegalArgumentException("Chunk overlap must be less than chunk size");
        }
        if (separators == null) {
            throw new NullPointerException("separators must not be null");
        }
        this.chunkSize = chunkSize;
        this.chunkOverlap = chunkOverlap;
        this.separators = new ArrayList<>(separators);
    }

    /**
     * Splits the content of {@code document} and attaches the document's
     * metadata to every resulting chunk.
     *
     * @param document the document to split
     * @return one document per chunk, in text order
     */
    @Override
    public List<Document> splitDocument(Document document) {
        return this.splitTextWithMetadata(document.getContent(), document.getMetadata());
    }

    /**
     * Splits {@code text} into chunks using this splitter's configuration.
     *
     * @param text the text to split; {@code null} or empty yields an empty list
     * @return the chunks in text order
     */
    @Override
    public List<String> splitText(String text) {
        return this.splitRecursively(text, this.separators);
    }

    /**
     * Splits {@code text} and wraps each chunk in a {@link Document} built
     * with {@code Document.of(chunk, metadata)}.
     *
     * @param text     the text to split
     * @param metadata metadata passed unchanged to every produced document
     * @return one document per chunk, in text order
     */
    @Override
    public List<Document> splitTextWithMetadata(String text, Map<String, Object> metadata) {
        List<String> chunks = this.splitText(text);
        return chunks.stream().map(chunk -> Document.of(chunk, metadata)).collect(Collectors.toList());
    }

    /**
     * Recursive worker: splits on the first applicable separator, recurses
     * into oversized pieces with the remaining finer separators, then merges.
     */
    private List<String> splitRecursively(String text, List<String> candidates) {
        if (text == null || text.isEmpty()) {
            return new ArrayList<>();
        }
        int separatorIndex = findSeparatorIndex(text, candidates);
        if (separatorIndex < 0) {
            // Nothing left to split on: fall back to fixed-size windows.
            return this.splitByLength(text);
        }
        String separator = candidates.get(separatorIndex);
        // Only separators finer than the one actually used are tried on oversized pieces.
        List<String> finer = candidates.subList(separatorIndex + 1, candidates.size());
        List<String> pieces = new ArrayList<>();
        // Pattern.quote makes the separator a literal; unquoted, ". " would be the
        // regex "any character followed by a space" and would eat text.
        for (String piece : text.split(java.util.regex.Pattern.quote(separator))) {
            if (piece.length() <= this.chunkSize) {
                pieces.add(piece);
            } else {
                pieces.addAll(this.splitRecursively(piece, finer));
            }
        }
        return this.mergeSplits(pieces, separator);
    }

    /**
     * Returns the index of the first non-empty separator contained in
     * {@code text}, or {@code -1} when none applies.
     */
    private static int findSeparatorIndex(String text, List<String> candidates) {
        for (int i = 0; i < candidates.size(); i++) {
            String candidate = candidates.get(i);
            if (!candidate.isEmpty() && text.contains(candidate)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Cuts {@code text} into windows of {@code chunkSize} characters whose
     * start positions advance by {@code chunkSize - chunkOverlap}.
     */
    private List<String> splitByLength(String text) {
        List<String> windows = new ArrayList<>();
        int length = text.length();
        int step = this.chunkSize - this.chunkOverlap; // positive: enforced by the constructor
        int start = 0;
        while (start < length) {
            // Compare against the remaining length so start + chunkSize cannot overflow.
            int end = length - start <= this.chunkSize ? length : start + this.chunkSize;
            windows.add(text.substring(start, end));
            if (end == length) {
                // The window reached the end; a further window would only repeat its tail.
                break;
            }
            start += step;
        }
        return windows;
    }

    /**
     * Greedily joins consecutive pieces with {@code separator} into chunks of
     * at most {@code chunkSize} characters. When a chunk is emitted, trailing
     * pieces totalling at most {@code chunkOverlap} characters are carried
     * over to the start of the next chunk, provided the next piece still fits.
     * A single piece longer than {@code chunkSize} is emitted on its own.
     */
    private List<String> mergeSplits(List<String> splits, String separator) {
        List<String> merged = new ArrayList<>();
        int separatorLength = separator.length();
        int windowStart = 0;   // index of the first piece of the chunk being built
        long windowLength = 0; // length of pieces [windowStart, i) joined by separator
        for (int i = 0; i < splits.size(); i++) {
            int pieceLength = splits.get(i).length();
            boolean windowHasPieces = i > windowStart;
            if (windowHasPieces && windowLength + separatorLength + pieceLength > this.chunkSize) {
                addIfNotEmpty(merged, String.join(separator, splits.subList(windowStart, i)));
                // Drop leading pieces until what remains is within the overlap budget
                // and leaves room for the incoming piece.
                while (windowStart < i
                        && (windowLength > this.chunkOverlap
                            || windowLength + separatorLength + pieceLength > this.chunkSize)) {
                    windowLength -= splits.get(windowStart).length();
                    windowStart++;
                    if (windowStart < i) {
                        windowLength -= separatorLength;
                    }
                }
            }
            if (i > windowStart) {
                windowLength += separatorLength;
            }
            windowLength += pieceLength;
        }
        if (windowStart < splits.size()) {
            addIfNotEmpty(merged, String.join(separator, splits.subList(windowStart, splits.size())));
        }
        return merged;
    }

    /** Appends {@code chunk} to {@code target} unless it is empty. */
    private static void addIfNotEmpty(List<String> target, String chunk) {
        if (!chunk.isEmpty()) {
            target.add(chunk);
        }
    }
}

