package org.hansken.plugin.extraction.hql_lite.lang.human;

import static java.lang.Character.highSurrogate;
import static java.lang.Character.isBmpCodePoint;
import static java.lang.Character.isDefined;
import static java.lang.Character.isLetterOrDigit;
import static java.lang.Character.lowSurrogate;
import static java.lang.Integer.toHexString;

import static org.apache.commons.lang3.text.translate.EntityArrays.invert;
import static org.hansken.plugin.extraction.util.ArgChecks.argNotNull;

import java.io.IOException;
import java.io.Writer;

import org.apache.commons.lang3.text.translate.AggregateTranslator;
import org.apache.commons.lang3.text.translate.CharSequenceTranslator;
import org.apache.commons.lang3.text.translate.JavaUnicodeEscaper;
import org.apache.commons.lang3.text.translate.LookupTranslator;
import org.apache.commons.lang3.text.translate.UnicodeUnescaper;

/**
 * HQL-human string utilities for escaping and unescaping text and terms.
 *
 * @author Netherlands Forensic Institute
 */
public final class HqlHumanUtil {

    /**
     * Non-alphanumeric characters that can occur within a term.
     * They are escaped only when they are the leading character of a term.
     * Note that : is omitted here as it always needs escaping.
     */
    private static final String GLUE_CHARS = "@.,-_";

    /**
     * Characters that should never be escaped.
     * # is the leading annotation character for #note, no escape needed.
     */
    private static final String NO_ESCAPE = "#";

    /**
     * Upper bound (exclusive) handed to {@link JavaUnicodeEscaper#above(int)}.
     * As that bound is exclusive, it has to be the last code point before the surrogate range,
     * otherwise the first surrogate (U+D800) itself would not be unicode-escaped.
     */
    private static final int LAST_BEFORE_SURROGATES = Character.MIN_SURROGATE - 1;

    /**
     * Escape sequences for the single quote, the backslash and the
     * backspace, form feed, newline, carriage return and tab characters.
     * The inverted table is used for unescaping.
     */
    private static final String[][] LOOKUPS = new String[][]{
        {"'", "\\'"},
        {"\\", "\\\\"},
        {"\b", "\\b"},
        {"\f", "\\f"},
        {"\n", "\\n"},
        {"\r", "\\r"},
        {"\t", "\\t"},
    };

    /**
     * Translator used by {@link #escapeText(String)}.
     */
    private static final CharSequenceTranslator ESCAPE =
        new AggregateTranslator(new LookupTranslator(LOOKUPS),
            // escape wildcard characters found in literals
            new LookupTranslator(new String[][]{{"*", "\\*"}, {"?", "\\?"}}),
            // escape control characters and surrogates, but not other code points
            JavaUnicodeEscaper.below(0x20),
            JavaUnicodeEscaper.between(0x7f, 0xbf),
            JavaUnicodeEscaper.above(LAST_BEFORE_SURROGATES));

    /**
     * Translator used by {@link #escapeTerm(String)}.
     * The {@link TermEscaper} comes last, so it only sees characters
     * that none of the preceding translators consumed.
     */
    private static final CharSequenceTranslator ESCAPE_TERM =
        new AggregateTranslator(new LookupTranslator(LOOKUPS),
            // escape wildcard characters found in literals, always escape :, never escape #
            new LookupTranslator(new String[][]{{"*", "\\*"}, {"?", "\\?"}, {":", "\\:"}}),
            // escape control characters and surrogates, but not other code points
            JavaUnicodeEscaper.below(0x20),
            JavaUnicodeEscaper.between(0x7f, 0xbf),
            JavaUnicodeEscaper.above(LAST_BEFORE_SURROGATES),
            new TermEscaper());

    /**
     * Translator used by {@link #unescape(String)}.
     */
    private static final CharSequenceTranslator UNESCAPE =
        // unescape unicode, lookups, remove unneeded single \'s
        new AggregateTranslator(new UnicodeCodepointUnescaper(),
            new UnicodeUnescaper(),
            new LookupTranslator(invert(LOOKUPS)),
            new LookupTranslator(new String[][]{{"\\", ""}}));

    /**
     * Translator used by {@link #unescapeKeepWildcards(String)}.
     */
    private static final CharSequenceTranslator UNESCAPE_KEEP_WILDCARDS =
        // same as standard unescape, but do not unescape escaped wildcards
        // as they're escaped by the term query handler to distinguish wildcards from literals;
        // escaped backslashes are passed through unchanged as well
        new AggregateTranslator(new LookupTranslator(new String[][]{
            {"\\*", "\\*"},
            {"\\?", "\\?"},
            {"\\\\", "\\\\"}
        }), UNESCAPE);

    private HqlHumanUtil() {
    }

    /**
     * Escapes full, untokenized text so it can be used within a single quoted string literal.
     * Escapes:
     * - backslashes
     * - single quotes
     * - backspace, form feed, newline, carriage return and tab
     * - the wildcard characters {@code *} and {@code ?}
     * - control characters, code points 0x7f up to and including 0xbf and
     *   code points from the surrogate range upwards (written as unicode escapes)
     * Plain spaces are left untouched.
     *
     * @param text the text to escape, checked with {@code argNotNull}
     * @return the escaped text
     */
    public static String escapeText(final String text) {
        argNotNull("text", text);
        return ESCAPE.translate(text);
    }

    /**
     * Escapes text so it can be used as a term.
     * Escapes everything {@link #escapeText(String)} escapes, and additionally:
     * - the {@code :} character
     * - any non-alphanumeric character that is neither a glue character nor {@code #}
     * - a glue character when it is the first character of the text.
     *
     * @param text the text to escape, checked with {@code argNotNull}
     * @return the escaped term
     */
    public static String escapeTerm(final String text) {
        argNotNull("text", text);

        final String escaped = ESCAPE_TERM.translate(text);

        // escape the first character in case it is a glue character
        // as it may have a special meaning in hql-human, like -
        final boolean escapeFirst = !text.isEmpty()
            && GLUE_CHARS.indexOf(text.charAt(0)) >= 0
            && escaped.charAt(0) != '\\';
        return escapeFirst ? '\\' + escaped : escaped;
    }

    /**
     * Unescapes escaped characters, including wildcards, so it can be used within a query or for suggestions.
     *
     * @param text the text to unescape, checked with {@code argNotNull}
     * @return the unescaped text
     * @throws IllegalArgumentException when text contains a malformed or undefined
     *                                  unicode code point escape sequence, see {@link UnicodeCodepointUnescaper}
     */
    public static String unescape(final String text) {
        argNotNull("text", text);
        return UNESCAPE.translate(text);
    }

    /**
     * Unescapes escaped characters, except wildcards, so it can be used within a term query.
     * Escaped wildcards ({@code *} and {@code ?}) and escaped backslashes keep their escaping backslash.
     *
     * @param text the text to unescape, checked with {@code argNotNull}
     * @return the unescaped text
     * @throws IllegalArgumentException when text contains a malformed or undefined
     *                                  unicode code point escape sequence, see {@link UnicodeCodepointUnescaper}
     */
    public static String unescapeKeepWildcards(final String text) {
        argNotNull("text", text);
        return UNESCAPE_KEEP_WILDCARDS.translate(text);
    }

    /**
     * Returns whether text contains only plain characters that need no escaping.
     * A character is plain when it is a letter or digit, when it is {@code #},
     * or when it is a glue character that is neither the first nor the last character.
     * An empty text is reported as plain.
     *
     * @param text the text to test, checked with {@code argNotNull}
     * @return whether text contains only plain characters that need no escaping.
     */
    public static boolean isPlain(final String text) {
        argNotNull("text", text);

        // needs no escaping but does need to be quoted
        // since hql-human parsing would fail otherwise
        final int last = text.length() - 1;
        for (int i = 0; i <= last; i++) {
            final char c = text.charAt(i);

            // glue characters can occur within a term, but not at the start or end
            final boolean interiorGlue = i > 0 && i < last && GLUE_CHARS.indexOf(c) >= 0;

            // the #-character can occur in field names, and needs no escaping;
            // otherwise, the character must be alphanumeric
            if (!interiorGlue && c != '#' && !isLetterOrDigit(c)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Unicode Codepoint Unescaper.
     * Parses unicode escape sequences formatted as \\u{codepoint}.
     * Where codepoint is a hexadecimal number indicating the codepoint.
     * See https://ecma-international.org/ecma-262/6.0/#sec-literals-string-literals
     */
    public static class UnicodeCodepointUnescaper extends CharSequenceTranslator {
        /**
         * Translates a single \\u{codepoint} escape sequence starting at index.
         *
         * @param input the text being translated
         * @param index the position in input to inspect
         * @param out the writer receiving the unescaped character(s)
         * @return the number of characters consumed, or 0 when input does not
         *         continue with a backslash, a 'u' and a '{' at index
         * @throws IOException when writing to out fails
         * @throws IllegalArgumentException when the sequence is opened but has no hex digits,
         *                                  is not closed by '}', does not fit an int,
         *                                  or denotes an undefined code point
         */
        @Override
        public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
            final int length = input.length();

            // bail out if input does not start with \\u followed by the required '{' char
            if (index + 2 >= length
                || input.charAt(index) != '\\'
                || input.charAt(index + 1) != 'u'
                || input.charAt(index + 2) != '{') {
                return 0;
            }

            // consume digits
            final int start = index + 3;
            int pos = start;
            while (pos < length && isHex(input.charAt(pos))) {
                pos++;
            }
            final int end = pos;

            // consume required '}' char, at least one digit must precede it
            if (pos < length && input.charAt(pos) == '}') {
                pos++;
                if (end > start) {
                    try {
                        final int codePoint = Integer.parseInt(input.subSequence(start, end).toString(), 16);
                        writeCodePoint(out, codePoint);
                        return pos - index;
                    }
                    catch (final NumberFormatException ignored) {
                        // too many digits to fit an int, reported as IAE below
                    }
                }
            }

            throw new IllegalArgumentException("bad unicode codepoint escape sequence: " + input.subSequence(index, pos));
        }

        /**
         * Returns whether c is a hexadecimal digit (0-9, a-f or A-F).
         */
        private static boolean isHex(final char c) {
            return c >= '0' && c <= '9' ||
                c >= 'a' && c <= 'f' ||
                c >= 'A' && c <= 'F';
        }

        /**
         * Writes codePoint to writer, as a single char when it is in the BMP
         * and as a high/low surrogate pair otherwise.
         *
         * @throws IllegalArgumentException when {@link Character#isDefined(int)} rejects codePoint
         */
        private static void writeCodePoint(final Writer writer, final int codePoint) throws IOException {
            if (!isDefined(codePoint)) {
                throw new IllegalArgumentException("undefined unicode codepoint: \\u{" + toHexString(codePoint) + "}");
            }
            if (isBmpCodePoint(codePoint)) {
                writer.write((char) codePoint);
            }
            else {
                // write surrogates
                writer.write(highSurrogate(codePoint));
                writer.write(lowSurrogate(codePoint));
            }
        }
    }

    /**
     * Escapes any non-letter or non-digit characters,
     * except glue characters or characters that never need escaping.
     *
     * @author Netherlands Forensic Institute
     */
    public static class TermEscaper extends CharSequenceTranslator {
        /**
         * Writes the character at index, preceded by a backslash when it needs escaping.
         *
         * @param input the text being translated
         * @param index the position of the character to translate
         * @param out the writer receiving the (escaped) character
         * @return always 1, as exactly one character is consumed
         * @throws IOException when writing to out fails
         */
        @Override
        public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
            final char c = input.charAt(index);
            final boolean plain = isLetterOrDigit(c) || GLUE_CHARS.indexOf(c) >= 0 || NO_ESCAPE.indexOf(c) >= 0;
            if (!plain) {
                out.write('\\');
            }
            out.write(c);
            return 1;
        }
    }
}
