package org.kink_lang.kink.internal.program.lex;

import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import org.kink_lang.kink.internal.function.Function4;

/**
 * A lexical analyzer of Kink programs.
 *
 * <p>The lexer repeatedly applies a single regular expression to the program text.
 * Each application consumes optional leading whitespace and comments,
 * then exactly one token candidate, which is converted to a {@link Token}
 * by the yielder registered for the matched named group.</p>
 */
public class Lexer implements Function<String, List<Token>> {

    /** Token yielders, tried in order; the first one whose group matched wins. */
    private final List<TokenYielder> tokenYielders;

    /**
     * Pattern to retrieve one token from the program text.
     *
     * <p>The repetitions over whitespace, simple str bodies and rich str bodies
     * are written as run-based possessive loops such as {@code (?:[^']++|'')*+}.
     * A greedy repetition of an alternation, such as {@code (''|[^'])*},
     * is matched recursively by java.util.regex, one level per iteration,
     * so a long literal or a long whitespace run could cause StackOverflowError.
     * The possessive forms accept exactly the same text:
     * the alternatives are disjoint on their first char,
     * and no later part of the pattern ever needs these loops to give chars back,
     * because the closing quotes are optional and Unrecog matches the empty string.</p>
     */
    private static final Pattern PAT = Pattern.compile(
            "(?<Whitespace>(?:[ \\r\\n]++|#[^\\n]*+)*+)"
            + "((?<HexInt>0x_*[0-9a-f][0-9a-f_]*(?<BadHexFollower>[g-zA-Z?])?)"
            + "|(?<BinInt>0b_*[01][01_]*(?<BadBinFollower>[a-zA-Z2-9?])?)"
            + "|(?<Decimal>[0-9][0-9_]*(\\.[0-9][0-9_]*)?(?<BadDecFollower>[a-zA-Z?])?)"
            + "|(?<SimpleStr>'(?:[^']++|'')*+(?<SimpleStrCloser>')?)"
            + "|(?<RichStr>\"(?<RsBody>(?:[^\"\\\\]++|\\\\(?:x\\{[0-9a-f]{1,6}\\}|.))*+)(?<RSCloser>\")?)"
            + "|(?<Noun>([a-z_][a-z0-9_?]*)?[A-Z][a-zA-Z0-9_?]*)"
            + "|(?<Verb>[a-z_][a-z0-9_?]*)"
            + "|(?<PseudoVar>\\\\binding(?![a-zA-Z0-9_?]))"
            + "|(?<Mark>\\.{3}|<[-<=]?|==?|!=?|\\|\\|?|&&?|>[>=]?|//?|[-+^~%*:$.\\[\\]{}()])"
            + "|(?<Eot>)\\z"
            + "|(?<TabError>\\t)"
            + "|(?<Unrecog>))", Pattern.DOTALL);

    /**
     * Pattern to retrieve units from a rich string body.
     *
     * <p>A unit is a run of ordinary chars, a {@code \x{...}} code point escape
     * (group "Code"), or a backslash followed by one char (group "Special").</p>
     */
    private static final Pattern RICHSTR_PAT = Pattern.compile(
            "[^\"\\\\]+|\\\\(x\\{(?<Code>[0-9a-f]{1,6})\\}|(?<Special>.))", Pattern.DOTALL);

    /**
     * Constructs a lexer.
     *
     * @param locale the locale for error message;
     *      it is not consulted by this class at present,
     *      as the message getters return fixed texts.
     */
    public Lexer(Locale locale) {
        // The order mirrors the order of the alternatives in PAT.
        // The pseudo variable "\binding" is yielded through yieldMark, as marks are.
        this.tokenYielders = List.of(
            new TokenYielder("HexInt", this::yieldHexInt),
            new TokenYielder("BinInt", this::yieldBinInt),
            new TokenYielder("Decimal", this::yieldDecimal),
            new TokenYielder("SimpleStr", this::yieldSimpleStr),
            new TokenYielder("RichStr", this::yieldRichStr),
            new TokenYielder("Noun", this::yieldNoun),
            new TokenYielder("Verb", this::yieldVerb),
            new TokenYielder("PseudoVar", this::yieldMark),
            new TokenYielder("Mark", this::yieldMark),
            new TokenYielder("Eot", this::yieldEot),
            new TokenYielder("TabError", this::yieldTabError),
            new TokenYielder("Unrecog", this::yieldUnrecog)
        );
    }

    /**
     * Yields a base16 int num token.
     *
     * @param start the start index of the HexInt group.
     * @param end the end index of the HexInt group.
     * @param fragment the text of the HexInt group.
     * @param matcher the matcher which holds the current match of PAT.
     * @return a num token, or an error token placed at the offending char
     *      if the literal is directly followed by a sym-like char.
     */
    private Token yieldHexInt(Integer start, Integer end, String fragment, Matcher matcher) {
        if (matcher.group("BadHexFollower") != null) {
            int pos = matcher.start("BadHexFollower");
            return new ErrorToken(getMsgNumCannotBeFollowedBySymLikeChar(), pos, pos);
        }
        // Strip digit separators and the "x" of the prefix; the leading "0" is harmless.
        BigInteger intNum = new BigInteger(fragment.replace("_", "").replace("x", ""), 16);
        return new NumToken(new BigDecimal(intNum), start, end);
    }

    /**
     * Yields a base2 int num token.
     *
     * @param start the start index of the BinInt group.
     * @param end the end index of the BinInt group.
     * @param fragment the text of the BinInt group.
     * @param matcher the matcher which holds the current match of PAT.
     * @return a num token, or an error token placed at the offending char
     *      if the literal is directly followed by a sym-like char.
     */
    private Token yieldBinInt(Integer start, Integer end, String fragment, Matcher matcher) {
        if (matcher.group("BadBinFollower") != null) {
            int pos = matcher.start("BadBinFollower");
            return new ErrorToken(getMsgNumCannotBeFollowedBySymLikeChar(), pos, pos);
        }
        // Strip digit separators and the "b" of the prefix; the leading "0" is harmless.
        BigInteger intNum = new BigInteger(fragment.replace("_", "").replace("b", ""), 2);
        return new NumToken(new BigDecimal(intNum), start, end);
    }

    /**
     * Yields a base10 num token.
     *
     * @param start the start index of the Decimal group.
     * @param end the end index of the Decimal group.
     * @param fragment the text of the Decimal group.
     * @param matcher the matcher which holds the current match of PAT.
     * @return a num token, or an error token placed at the offending char
     *      if the literal is directly followed by a sym-like char.
     */
    private Token yieldDecimal(Integer start, Integer end, String fragment, Matcher matcher) {
        if (matcher.group("BadDecFollower") != null) {
            int pos = matcher.start("BadDecFollower");
            return new ErrorToken(getMsgNumCannotBeFollowedBySymLikeChar(), pos, pos);
        }
        BigDecimal decimal = new BigDecimal(fragment.replace("_", ""));
        return new NumToken(decimal, start, end);
    }

    /**
     * Yields a simple str token.
     *
     * @param start the start index of the SimpleStr group.
     * @param end the end index of the SimpleStr group.
     * @param fragment the text of the SimpleStr group, including the quotes.
     * @param matcher the matcher which holds the current match of PAT.
     * @return a str token, or an error token spanning the fragment
     *      if the closing quote is missing.
     */
    private Token yieldSimpleStr(Integer start, Integer end, String fragment, Matcher matcher) {
        if (matcher.group("SimpleStrCloser") == null) {
            return new ErrorToken(getMsgStrNotClosed(), start, end);
        }

        // Drop the surrounding quotes, then unescape doubled quotes.
        String escapedContent = fragment.substring(1, fragment.length() - 1);
        String content = escapedContent.replace("''", "'");
        return new StrToken(content, start, end);
    }

    /**
     * Yields a rich str token.
     *
     * @param start the start index of the RichStr group.
     * @param end the end index of the RichStr group.
     * @param fragment the text of the RichStr group, including the quotes.
     * @param matcher the matcher which holds the current match of PAT.
     * @return a str token; or an error token spanning the fragment
     *      if the closing quote is missing; or an error token spanning the unit
     *      if the body contains a wrong special char or an out-of-bound code.
     */
    private Token yieldRichStr(Integer start, Integer end, String fragment, Matcher matcher) {
        if (matcher.group("RSCloser") == null) {
            return new ErrorToken(getMsgStrNotClosed(), start, end);
        }

        StringBuilder sb = new StringBuilder();
        Matcher bm = RICHSTR_PAT.matcher(matcher.group("RsBody"));
        while (bm.find()) {
            // Indices of bm are relative to the body, which starts after the opening quote.
            int unitStart = start + "\"".length() + bm.start();
            int unitEnd = start + "\"".length() + bm.end();
            if (bm.group("Special") != null) {
                String decoded = decodeSpecial(bm.group("Special").charAt(0));
                if (decoded == null) {
                    return new ErrorToken(getMsgWrongSpecialChar(), unitStart, unitEnd);
                }
                sb.append(decoded);
            } else if (bm.group("Code") != null) {
                // At most six hex digits, so the value always fits in an int.
                int codePoint = Integer.parseInt(bm.group("Code"), 16);
                if (codePoint > Character.MAX_CODE_POINT) {
                    return new ErrorToken(getMsgCodeOutOfBound(), unitStart, unitEnd);
                }
                sb.appendCodePoint(codePoint);
            } else {
                sb.append(bm.group(0));
            }
        }
        return new StrToken(sb.toString(), start, end);
    }

    /**
     * Decodes a special char from the char following the backslash.
     *
     * @param ch the char following the backslash.
     * @return the decoded text, or null if {@code ch} does not denote a special char.
     */
    private String decodeSpecial(char ch) {
        return ch == '0' ? "\u0000"
            : ch == 'a' ? "\u0007"
            : ch == 'b' ? "\b"
            : ch == 't' ? "\t"
            : ch == 'n' ? "\n"
            : ch == 'v' ? "\u000b"
            : ch == 'f' ? "\f"
            : ch == 'r' ? "\r"
            : ch == 'e' ? "\u001b"
            : ch == '"' ? "\""
            : ch == '\\' ? "\\"
            : null;
    }

    /**
     * Yields a noun token.
     *
     * @param start the start index of the Noun group.
     * @param end the end index of the Noun group.
     * @param fragment the text of the Noun group.
     * @param matcher the matcher which holds the current match of PAT; not used.
     * @return a noun token for the fragment.
     */
    private Token yieldNoun(Integer start, Integer end, String fragment, Matcher matcher) {
        return new NounToken(fragment, start, end);
    }

    /**
     * Yields a verb token.
     *
     * @param start the start index of the Verb group.
     * @param end the end index of the Verb group.
     * @param fragment the text of the Verb group.
     * @param matcher the matcher which holds the current match of PAT; not used.
     * @return a verb token for the fragment.
     */
    private Token yieldVerb(Integer start, Integer end, String fragment, Matcher matcher) {
        return new VerbToken(fragment, start, end);
    }

    /**
     * Yields a mark token.
     *
     * <p>This is used for both the Mark group and the PseudoVar group.</p>
     *
     * @param start the start index of the matched group.
     * @param end the end index of the matched group.
     * @param fragment the text of the matched group.
     * @param matcher the matcher which holds the current match of PAT.
     * @return a mark token, which records whether the Whitespace group
     *      preceding the fragment is non-empty.
     */
    private Token yieldMark(Integer start, Integer end, String fragment, Matcher matcher) {
        boolean isAfterWhitespace = ! matcher.group("Whitespace").isEmpty();
        return new MarkToken(fragment, start, end, isAfterWhitespace);
    }

    /**
     * Yields an end-of-text token.
     *
     * @param start the start index of the Eot group.
     * @param end the end index of the Eot group.
     * @param fragment the text of the Eot group, which is empty.
     * @param matcher the matcher which holds the current match of PAT; not used.
     * @return an end-of-text token at {@code end}.
     */
    private Token yieldEot(Integer start, Integer end, String fragment, Matcher matcher) {
        return new EotToken(end);
    }

    /**
     * Yields tab-is-not-a-whitespace error token.
     *
     * @param start the start index of the tab.
     * @param end the end index of the tab; not used.
     * @param fragment the text of the TabError group; not used.
     * @param matcher the matcher which holds the current match of PAT; not used.
     * @return an error token of zero length placed at the tab.
     */
    private Token yieldTabError(Integer start, Integer end, String fragment, Matcher matcher) {
        // the error ends at the start pos of the tab,
        // not to be regarded as recoverable
        int index = start;
        return new ErrorToken(getMsgTabDisallowed(), index, index);
    }

    /**
     * Yields an unrecognizable error token.
     *
     * @param start the index where no token could be recognized.
     * @param end the end index of the Unrecog group; not used.
     * @param fragment the text of the Unrecog group, which is empty; not used.
     * @param matcher the matcher which holds the current match of PAT; not used.
     * @return an error token of zero length placed at {@code start}.
     */
    private Token yieldUnrecog(Integer start, Integer end, String fragment, Matcher matcher) {
        int index = start;
        return new ErrorToken(getMsgUnrecogChar(), index, index);
    }

    /**
     * Returns a list of tokens analyzed from the program text.
     *
     * <p>The result tokens end with an EotToken or an ErrorToken.</p>
     *
     * @param programText the program text to be analyzed.
     * @return an unmodifiable list of tokens analyzed from the program text.
     */
    @Override
    public List<Token> apply(String programText) {
        List<Token> tokens = new ArrayList<>();
        Matcher matcher = PAT.matcher(programText);
        while (true) {
            // PAT can always match, at worst through the empty Unrecog alternative;
            // a failure here means that PAT itself is broken.
            if (! matcher.find()) {
                throw new AssertionError("token pattern must always match");
            }
            Token token = tokenYielders.stream()
                .filter(yielder -> yielder.accepts(matcher))
                .findFirst()
                .map(yielder -> yielder.apply(matcher))
                .orElseThrow(AssertionError::new);
            tokens.add(token);
            // Both an end-of-text and an error terminate the analysis.
            if (token instanceof EotToken || token instanceof ErrorToken) {
                return Collections.unmodifiableList(tokens);
            }
        }
    }

    /**
     * Returns the message which states that nums cannot be directly followed by sym-like chars.
     */
    final String getMsgNumCannotBeFollowedBySymLikeChar() {
        return "num cannot be directly followed by sym-like chars";
    }

    /**
     * Returns the message which states that a tab is not a whitespace.
     */
    final String getMsgTabDisallowed() {
        return "tab cannot be used as a whitespace character";
    }

    /**
     * Returns the message which indicates unrecognizable characters.
     */
    final String getMsgUnrecogChar() {
        return "unrecognizable characters";
    }

    /**
     * Returns the message which indicates an unclosed string.
     */
    final String getMsgStrNotClosed() {
        return "str not closed";
    }

    /**
     * Returns the message which indicates a wrong special char.
     */
    final String getMsgWrongSpecialChar() {
        return "wrong special char";
    }

    /**
     * Returns the message that "\x{xxxxxx}" is out of the bound of valid codepoints.
     */
    final String getMsgCodeOutOfBound() {
        return "code must be in range U+000000..U+10ffff";
    }

    /**
     * A function from a matcher to a token
     * which is applied only when the matcher contains the specified group.
     */
    private static class TokenYielder {

        /** The group name. */
        private final String groupName;

        /** A function to yield a token from (start, end, fragment, matcher). */
        private final Function4<Integer, Integer, String, Matcher, Token> fun;

        /**
         * Constructs a yielder.
         *
         * @param groupName the name of the group of PAT which this yielder handles.
         * @param fun the function to yield a token from
         *      the start index, the end index and the text of the group, and the matcher.
         */
        TokenYielder(String groupName, Function4<Integer, Integer, String, Matcher, Token> fun) {
            this.groupName = groupName;
            this.fun = fun;
        }

        /**
         * Returns whether the matcher contains the specified group.
         *
         * @param matcher the matcher which holds a successful match.
         * @return true if the group took part in the match, even with an empty text.
         */
        boolean accepts(Matcher matcher) {
            return matcher.group(groupName) != null;
        }

        /**
         * Yields a token from the matcher.
         *
         * @param matcher the matcher which holds a match containing the group.
         * @return the token made by the function of this yielder.
         */
        Token apply(Matcher matcher) {
            int start = matcher.start(groupName);
            int end = matcher.end(groupName);
            String fragment = matcher.group(groupName);
            return fun.apply(start, end, fragment, matcher);
        }

    }

}

// vim: et sw=4 sts=4 fdm=marker
