/*!
* lunr.Index
* Copyright (C) @YEAR Oliver Nightingale
*/
/**
* An index contains the built index of all documents and provides a query interface
* to the index.
*
* Usually instances of lunr.Index will not be created using this constructor, instead
* lunr.Builder should be used to construct new indexes, or lunr.Index.load should be
* used to load previously built and serialized indexes.
*
* @constructor
* @param {Object} attrs - The attributes of the built search index.
* @param {Object} attrs.invertedIndex - An index of term/field to document reference.
* @param {Object<string, lunr.Vector>} attrs.documentVectors - Document vectors keyed by document reference.
* @param {lunr.TokenSet} attrs.tokenSet - A set of all corpus tokens.
* @param {number} attrs.documentCount - The total number of documents held in the index.
* @param {number} attrs.averageDocumentLength - The average length of all documents in the index.
* @param {number} attrs.b - A parameter for the document scoring algorithm.
* @param {number} attrs.k1 - A parameter for the document scoring algorithm.
* @param {string[]} attrs.fields - The names of indexed document fields.
*/
lunr.Index = function (attrs) {
  /*
   * The attributes that make up a built index, listed in the order in
   * which they are copied from the supplied attrs onto the new instance.
   */
  var attributeNames = [
    "invertedIndex",
    "documentVectors",
    "tokenSet",
    "documentCount",
    "averageDocumentLength",
    "b",
    "k1",
    "fields"
  ]

  for (var i = 0; i < attributeNames.length; i++) {
    var attributeName = attributeNames[i]
    this[attributeName] = attrs[attributeName]
  }
}
/**
* A result contains details of a document matching a search query.
* @typedef {Object} lunr.Index~Result
* @property {string} ref - The reference of the document this result represents.
* @property {number} score - A number between 0 and 1 representing how similar this document is to the query.
* @property {lunr.MatchData} matchData - Contains metadata about this match including which term(s) caused the match.
*/
/**
* Although lunr provides the ability to create queries using lunr.Query, it also provides a simple
* query language which itself is parsed into an instance of lunr.Query.
*
* For programmatically building queries it is advised to directly use lunr.Query, the query language
* is best used for human entered text rather than program generated text.
*
* At its simplest queries can just be a single term, e.g. `hello`, multiple terms are also supported
* and will be combined with OR, e.g. `hello world` will match documents that contain either 'hello'
* or 'world', though those that contain both will rank higher in the results.
*
* Wildcards can be included in terms to match one or more unspecified characters, these wildcards can
* be inserted anywhere within the term, and more than one wildcard can exist in a single term. Adding
* wildcards will increase the number of documents that will be found but can also have a negative
* impact on query performance, especially with wildcards at the beginning of a term.
*
* Terms can be restricted to specific fields, e.g. `title:hello`, only documents with the term
* hello in the title field will match this query. Using a field not present in the index will lead
* to an error being thrown.
*
* Modifiers can also be added to terms, lunr supports edit distance and boost modifiers on terms. A term
* boost will make documents matching that term score higher, e.g. `foo^5`. Edit distance is also supported
* to provide fuzzy matching, e.g. 'hello~2' will match documents with hello with an edit distance of 2.
* Avoid large values for edit distance to improve query performance.
*
* @typedef {string} lunr.Index~QueryString
* @example <caption>Simple single term query</caption>
* hello
* @example <caption>Multiple term query</caption>
* hello world
* @example <caption>term scoped to a field</caption>
* title:hello
* @example <caption>term with a boost of 10</caption>
* hello^10
* @example <caption>term with an edit distance of 2</caption>
* hello~2
*/
/**
* Performs a search against the index using lunr query syntax.
*
* Results will be returned sorted by their score, the most relevant results
* will be returned first.
*
* For more programmatic querying use lunr.Index#query.
*
* @param {lunr.Index~QueryString} queryString - A string containing a lunr query.
* @throws {lunr.QueryParseError} If the passed query string cannot be parsed.
* @returns {lunr.Index~Result[]}
*/
lunr.Index.prototype.search = function (queryString) {
  /*
   * The query string is parsed directly into the query object that
   * lunr.Index#query yields to this builder.
   */
  var buildFromQueryString = function (query) {
    new lunr.QueryParser(queryString, query).parse()
  }

  return this.query(buildFromQueryString)
}
/**
* A query builder callback provides a query object to be used to express
* the query to perform on the index.
*
* @callback lunr.Index~queryBuilder
* @param {lunr.Query} query - The query object to build up.
* @this lunr.Query
*/
/**
* Performs a query against the index using the yielded lunr.Query object.
*
* If performing programmatic queries against the index this method is preferred
* over lunr.Index#search so as to avoid the additional query parsing overhead.
*
* A query object is yielded to the supplied function which should be used to
* express the query to be run against the index.
*
* Note that although this function takes a callback parameter it is _not_ an
* asynchronous operation, the callback is just yielded a query object to be
* customized.
*
* @param {lunr.Index~queryBuilder} fn - A function that is used to build the query.
* @returns {lunr.Index~Result[]}
*/
lunr.Index.prototype.query = function (fn) {
  // for each query clause
  // * process terms
  // * expand terms from token set
  // * find matching documents and metadata
  // * get document vectors
  // * score documents

  /*
   * matchingDocuments is keyed by document ref. It is created without a
   * prototype so that refs such as 'constructor', 'toString' or
   * '__proto__' can never collide with members inherited from
   * Object.prototype when tested with `in` or assigned below.
   */
  var query = new lunr.Query(this.fields),
      matchingDocuments = Object.create(null),
      queryVector = new lunr.Vector

  fn.call(query, query)

  /*
   * The weight only depends on the number of clauses and on index wide
   * parameters, so it is calculated once rather than for every term.
   *
   * NOTE(review): `(1 - b) + b` is algebraically 1, so `b` has essentially
   * no influence on this value. The expression is kept exactly as it was
   * because it has to agree with how document vectors were scored when
   * the index was built - TODO confirm against the builder.
   */
  var clauseCount = query.clauses.length,
      weight = 1 / (((1 - this.b) + this.b) * (clauseCount / this.averageDocumentLength))

  // TODO: need to potentially pass the token through the pipeline
  for (var i = 0; i < clauseCount; i++) {
    /*
     * Get a list of matching terms from this query clause. The query
     * clause is used to build a token set, which is then intersected
     * against the index's token set to get a list of terms to lookup
     * in the inverted index.
     */
    var clause = query.clauses[i],
        termTokenSet = lunr.TokenSet.fromClause(clause),
        expandedTerms = this.tokenSet.intersect(termTokenSet).toArray()

    for (var j = 0; j < expandedTerms.length; j++) {
      /*
       * For each term calculate the score as the term relates to the
       * query. This score will be used to build a vector space
       * representation of the query.
       *
       * The term's index is also needed so that the score is inserted
       * into the query vector at the right position.
       */
      var expandedTerm = expandedTerms[j],
          posting = this.invertedIndex[expandedTerm],
          termIdf = this.idf(expandedTerm),
          termScore = termIdf * weight / this.k1 + weight,
          termIndex = posting.index

      /*
       * Inserting the found query term, along with its term index
       * into the vector representing the query. It is here that
       * any boosts are applied to the score.
       */
      queryVector.insert(termIndex, termScore * clause.boost)

      for (var k = 0; k < clause.fields.length; k++) {
        /*
         * For each field that this query term is scoped by we need to
         * get all the document refs that have this term in that field.
         *
         * The posting is the entry in the invertedIndex for the matching
         * term from above.
         */
        var field = clause.fields[k],
            fieldPosting = posting[field],
            fieldDocumentRefs = Object.keys(fieldPosting)

        for (var l = 0; l < fieldDocumentRefs.length; l++) {
          var matchingDocumentRef = fieldDocumentRefs[l]

          /*
           * The IDF for the term/field tuple is also stored
           * in the posting, this is not a document and so is
           * ignored at this step. This does mean that 'idf' is
           * not allowed as a document ref.
           */
          if (matchingDocumentRef === "idf") {
            continue
          }

          /*
           * All metadata for this term/field/document triple
           * are then extracted and collected into an instance
           * of lunr.MatchData ready to be returned in the query
           * results
           */
          var documentMetadata = fieldPosting[matchingDocumentRef],
              matchData = new lunr.MatchData (expandedTerm, field, documentMetadata)

          if (matchingDocumentRef in matchingDocuments) {
            matchingDocuments[matchingDocumentRef].combine(matchData)
          } else {
            matchingDocuments[matchingDocumentRef] = matchData
          }
        }
      }
    }
  }

  var matchingDocumentRefs = Object.keys(matchingDocuments),
      results = []

  for (var m = 0; m < matchingDocumentRefs.length; m++) {
    /*
     * With all the matching documents found they now need
     * to be sorted by their relevance to the query. This
     * is done by retrieving the document's vector representation
     * and then finding its similarity with the query vector
     * that was constructed earlier.
     *
     * This score, along with the document ref and any metadata
     * we collected into a lunr.MatchData instance are stored
     * in the results array ready for returning to the caller
     */
    var ref = matchingDocumentRefs[m],
        documentVector = this.documentVectors[ref]

    results.push({
      ref: ref,
      score: queryVector.similarity(documentVector),
      matchData: matchingDocuments[ref]
    })
  }

  // highest scoring documents first
  return results.sort(function (a, b) {
    return b.score - a.score
  })
}
// TODO: this is copied from the builder
// * store the idf for the ALL field right on the posting it self
// this will allow us to get rid of this
/**
 * Calculates a weighting for a term from the number of entries the term has
 * in the inverted index relative to the number of documents in the index.
 *
 * The keys of every object valued entry of the term's posting are counted.
 * NOTE(review): this includes any 'idf' key stored alongside the document
 * refs in a field posting - confirm this matches the calculation used when
 * the index was built.
 *
 * @param {string} term - The term to look up in the inverted index.
 * @returns {number} (documentCount - n + 0.5) / (n + 0.5), where n is the
 * number of keys counted for the term. A term without a posting gives n = 0.
 */
lunr.Index.prototype.idf = function (term) {
  var posting = this.invertedIndex[term],
      documentsWithTerm = 0

  for (var fieldName in posting) {
    var fieldPosting = posting[fieldName]

    /*
     * As well as the per field postings the posting holds values that are
     * not objects, e.g. the numeric `index` read by lunr.Index#query.
     * Object.keys throws a TypeError when given a primitive in ES5
     * environments (and contributes no keys for a number in ES2015+), so
     * only object valued entries are counted.
     */
    if (fieldPosting === null || typeof fieldPosting !== "object") {
      continue
    }

    documentsWithTerm += Object.keys(fieldPosting).length
  }

  return (this.documentCount - documentsWithTerm + 0.5) / (documentsWithTerm + 0.5)
}
/**
* Prepares the index for JSON serialization.
*
* The schema for this JSON blob will be described in a
* separate JSON schema file.
*
* @returns {Object}
*/
lunr.Index.prototype.toJSON = function () {
  var sortedTerms = Object.keys(this.invertedIndex).sort(),
      documentRefs = Object.keys(this.documentVectors),
      serializedInvertedIndex = [],
      serializedVectors = []

  // the inverted index becomes a list of [term, posting] tuples, sorted by term
  for (var i = 0; i < sortedTerms.length; i++) {
    var term = sortedTerms[i]
    serializedInvertedIndex.push([term, this.invertedIndex[term]])
  }

  // each document vector becomes a [ref, serialized vector] tuple
  for (var j = 0; j < documentRefs.length; j++) {
    var ref = documentRefs[j]
    serializedVectors.push([ref, this.documentVectors[ref].toJSON()])
  }

  // TODO: include the version of lunr.
  return {
    averageDocumentLength: this.averageDocumentLength,
    b: this.b,
    k1: this.k1,
    fields: this.fields,
    documentVectors: serializedVectors,
    invertedIndex: serializedInvertedIndex
  }
}
/**
* Loads a previously serialized lunr.Index
*
* @param {Object} serializedIndex - A previously serialized lunr.Index
* @returns {lunr.Index}
*/
lunr.Index.load = function (serializedIndex) {
  /*
   * Both lookup tables are keyed by values taken from the corpus (document
   * refs and terms). They are created without a prototype so that a key
   * such as '__proto__' is stored as an ordinary property instead of
   * replacing the prototype of the table.
   */
  var serializedVectors = serializedIndex.documentVectors,
      serializedInvertedIndex = serializedIndex.invertedIndex,
      documentVectors = Object.create(null),
      invertedIndex = Object.create(null),
      tokenSetBuilder = new lunr.TokenSet.Builder

  // TODO check the version of lunr.

  // document vectors are serialized as [ref, elements] tuples
  for (var i = 0; i < serializedVectors.length; i++) {
    var vectorTuple = serializedVectors[i],
        ref = vectorTuple[0],
        elements = vectorTuple[1]

    documentVectors[ref] = new lunr.Vector(elements)
  }

  /*
   * The inverted index is serialized as [term, posting] tuples. Each term
   * is also fed to the token set builder, in serialized order, so that the
   * token set can be rebuilt rather than stored.
   */
  for (var j = 0; j < serializedInvertedIndex.length; j++) {
    var postingTuple = serializedInvertedIndex[j],
        term = postingTuple[0],
        posting = postingTuple[1]

    tokenSetBuilder.insert(term)
    invertedIndex[term] = posting
  }

  tokenSetBuilder.finish()

  return new lunr.Index({
    b: serializedIndex.b,
    k1: serializedIndex.k1,
    fields: serializedIndex.fields,
    averageDocumentLength: serializedIndex.averageDocumentLength,
    // the document count is not serialized, there is one vector per document
    documentCount: serializedVectors.length,
    documentVectors: documentVectors,
    invertedIndex: invertedIndex,
    tokenSet: tokenSetBuilder.root
  })
}