package ai.passio.passiosdk.passiofood.voice.local.tokenization


/**
 * A Kotlin implementation of BERT tokenization. Original Python code:
 * https://github.com/google-research/bert/blob/master/tokenization.py
 *
 * Runs full tokenization: splits a String into subtokens and converts tokens into their ids.
 */
internal class FullTokenizer(private val dic: Map<String, Int>, doLowerCase: Boolean) {
    // First stage of tokenization; receives the `doLowerCase` flag unchanged.
    private val basicTokenizer: BasicTokenizer = BasicTokenizer(doLowerCase)

    // Second stage of tokenization; built from the same vocabulary map that is used for id lookup.
    private val wordpieceTokenizer: WordpieceTokenizer = WordpieceTokenizer(dic)

    /**
     * Tokenizes [text] in two stages: [text] is first passed to the basic tokenizer, then every
     * token it produces is passed to the wordpiece tokenizer.
     *
     * @param text the text to tokenize; it is forwarded as-is (including `null`) to the basic
     *   tokenizer. NOTE(review): how `null` is handled depends on `BasicTokenizer` — confirm there.
     * @return the wordpiece tokenizer's output for every basic token, concatenated in order.
     */
    fun tokenize(text: String?): List<String> {
        val splitTokens: MutableList<String> = ArrayList()
        for (token in basicTokenizer.tokenize(text)) {
            splitTokens.addAll(wordpieceTokenizer.tokenize(token))
        }
        return splitTokens
    }

    /**
     * Looks up the vocabulary id of every token in [tokens].
     *
     * @param tokens the tokens to convert; each one must be a key of the vocabulary map.
     * @return a new mutable list holding one id per token, in the same order as [tokens].
     * @throws NullPointerException if a token is not present in the vocabulary map.
     */
    fun convertTokensToIds(tokens: List<String>): MutableList<Int> =
        tokens.mapTo(ArrayList<Int>(tokens.size)) { token ->
            // Same exception type the previous `!!` raised, but now naming the offending token
            // so that a vocabulary mismatch can be diagnosed from the stack trace alone.
            dic[token]
                ?: throw NullPointerException("Token '$token' is not present in the vocabulary")
        }
}