/*!
* lunr.tokenizer
* Copyright (C) @YEAR Oliver Nightingale
*/
/**
 * A function for splitting a value into tokens ready to be inserted into
 * the search index.
 *
 * - `null` and `undefined` produce an empty array.
 * - Arrays are not split: every element is passed through
 *   `lunr.utils.asString`, lower-cased and wrapped in a `lunr.Token`.
 * - Any other value is converted with `toString`, trimmed, lower-cased and
 *   then split wherever a single character matches
 *   `lunr.tokenizer.seperator`. Change the value of that property to change
 *   how strings are split into tokens.
 *
 * Tokens created from a string carry two pieces of metadata:
 * `position`, a `[start, length]` pair where `start` is the offset of the
 * token in the untrimmed string returned by `toString`, and `index`, the
 * ordinal of the token within the returned array.
 *
 * @static
 * @param {?(string|object|object[])} obj - The object to convert into tokens
 * @returns {lunr.Token[]}
 */
lunr.tokenizer = function (obj) {
  // Loose equality on purpose: `== null` matches both null and undefined.
  if (obj == null) return []

  if (Array.isArray(obj)) {
    // Wrap every element so callers always receive lunr.Token instances,
    // exactly as they do for string input.
    return obj.map(function (t) {
      return new lunr.Token(lunr.utils.asString(t).toLowerCase(), {})
    })
  }

  var raw = obj.toString(),
      str = raw.trim().toLowerCase(),
      len = str.length,
      // Number of leading whitespace characters removed by trim(); added back
      // below so positions refer to the original string. search() yields -1
      // for empty / all-whitespace input, in which case no tokens are emitted.
      offset = Math.max(raw.search(/\S/), 0),
      tokens = []

  // sliceEnd deliberately runs one past the last character so that the final
  // token is flushed when the end of the string is reached.
  for (var sliceEnd = 0, sliceStart = 0; sliceEnd <= len; sliceEnd++) {
    var char = str.charAt(sliceEnd),
        sliceLength = sliceEnd - sliceStart

    // String#match is used (rather than RegExp#test) so that a separator
    // carrying the global flag does not keep state between characters.
    if (char.match(lunr.tokenizer.seperator) || sliceEnd === len) {
      // Consecutive separators yield empty slices, which are skipped.
      if (sliceLength > 0) {
        tokens.push(
          new lunr.Token(str.slice(sliceStart, sliceEnd), {
            position: [offset + sliceStart, sliceLength],
            index: tokens.length
          })
        )
      }

      sliceStart = sliceEnd + 1
    }
  }

  return tokens
}
/**
* The separator used to split a string into tokens. Override this property to change the
* behaviour of `lunr.tokenizer` when tokenizing strings. By default this splits on whitespace
* and hyphens.
*
* `lunr.tokenizer` matches this pattern against one character of the input at a time.
* Note that the property name itself is spelled `seperator`.
*
* @static
* @see lunr.tokenizer
*/
lunr.tokenizer.seperator = /[\s\-]+/