001package org.nasdanika.ai;
002
003import java.util.Collections;
004import java.util.Map;
005import java.util.TreeMap;
006import java.util.function.Function;
007import java.util.stream.Collectors;
008import java.util.stream.Stream;
009
010import reactor.core.publisher.Mono;
011
012/**
013 * Splits input by whitespace, lowercases and then computes frequency of each word
014 */
015public class BagOfWordsGenerator implements TextEmbeddingGenerator<Map<String,Integer>> {
016
017        @Override
018        public Mono<Map<String, Integer>> generateAsync(String source) {
019                if (source == null || source.trim().length() == 0) {
020                        return Mono.just(Collections.emptyMap());
021                }
022                Map<String, Integer> result = new TreeMap<>();
023                Stream
024                        .of(source.split("\\s+"))
025                        .map(String::toLowerCase)
026                        .collect(Collectors.groupingBy(Function.identity()))
027                        .entrySet()
028                        .forEach(e -> result.put(e.getKey(), e.getValue().size()));
029                return Mono.just(result);
030        }
031
032}