Source: index.js

/*!
 * lunr.Index
 * Copyright (C) @YEAR Oliver Nightingale
 */

/**
 * Holds a fully built search index and exposes the query interface over it.
 *
 * This constructor is rarely called directly: use lunr.Builder to create a new
 * index, or lunr.Index.load to restore one that was previously serialized.
 *
 * Each of the attributes listed below is copied, by reference, from `attrs`
 * onto the new instance under the same name.
 *
 * @constructor
 * @param {Object} attrs - The attributes of the built search index.
 * @param {Object} attrs.invertedIndex - An index of term/field to document reference.
 * @param {Object<string, lunr.Vector>} attrs.documentVectors - Document vectors keyed by document reference.
 * @param {lunr.TokenSet} attrs.tokenSet - A set of all corpus tokens.
 * @param {number} attrs.documentCount - The total number of documents held in the index.
 * @param {number} attrs.averageDocumentLength - The average length of all documents in the index.
 * @param {number} attrs.b - A parameter for the document scoring algorithm.
 * @param {number} attrs.k1 - A parameter for the document scoring algorithm.
 * @param {string[]} attrs.fields - The names of indexed document fields.
 */
lunr.Index = function (attrs) {
  // The order of this list is the order in which properties appear on the instance.
  var attributeNames = [
    "invertedIndex",
    "documentVectors",
    "tokenSet",
    "documentCount",
    "averageDocumentLength",
    "b",
    "k1",
    "fields"
  ]

  for (var n = 0; n < attributeNames.length; n++) {
    var attributeName = attributeNames[n]
    this[attributeName] = attrs[attributeName]
  }
}

/**
 * A result contains details of a document matching a search query.
 * @typedef {Object} lunr.Index~Result
 * @property {string} ref - The reference of the document this result represents.
 * @property {number} score - A number between 0 and 1 representing how similar this document is to the query.
 * @property {lunr.MatchData} matchData - Contains metadata about this match including which term(s) caused the match.
 */

/**
 * Although lunr provides the ability to create queries using lunr.Query, it also provides a simple
 * query language which itself is parsed into an instance of lunr.Query.
 *
 * For programmatically building queries it is advised to use lunr.Query directly; the query language
 * is best used for human entered text rather than program generated text.
 *
 * At its simplest a query can just be a single term, e.g. `hello`. Multiple terms are also supported
 * and will be combined with OR, e.g. `hello world` will match documents that contain either 'hello'
 * or 'world', though those that contain both will rank higher in the results.
 *
 * Wildcards can be included in terms to match one or more unspecified characters, these wildcards can
 * be inserted anywhere within the term, and more than one wildcard can exist in a single term. Adding
 * wildcards will increase the number of documents that will be found but can also have a negative
 * impact on query performance, especially with wildcards at the beginning of a term.
 *
 * Terms can be restricted to specific fields, e.g. `title:hello`, only documents with the term
 * hello in the title field will match this query. Using a field not present in the index will lead
 * to an error being thrown.
 *
 * Modifiers can also be added to terms, lunr supports edit distance and boost modifiers on terms. A term
 * boost will make documents matching that term score higher, e.g. `foo^5`. Edit distance is also supported
 * to provide fuzzy matching, e.g. 'hello~2' will match documents with hello with an edit distance of 2.
 * Avoid large values for edit distance to improve query performance.
 *
 * @typedef {string} lunr.Index~QueryString
 * @example <caption>Simple single term query</caption>
 * hello
 * @example <caption>Multiple term query</caption>
 * hello world
 * @example <caption>term scoped to a field</caption>
 * title:hello
 * @example <caption>term with a boost of 10</caption>
 * hello^10
 * @example <caption>term with an edit distance of 2</caption>
 * hello~2
 */

/**
 * Searches the index with a query written in the lunr query language.
 *
 * The query string is parsed into a lunr.Query by lunr.QueryParser and then
 * run through lunr.Index#query, so the results come back ordered by score
 * with the most relevant document first.
 *
 * For more programmatic querying use lunr.Index#query.
 *
 * @param {lunr.Index~QueryString} queryString - A string containing a lunr query.
 * @throws {lunr.QueryParseError} If the passed query string cannot be parsed.
 * @returns {lunr.Index~Result[]}
 */
lunr.Index.prototype.search = function (queryString) {
  // Query builder that fills the yielded query from the parsed string.
  var buildFromQueryString = function (query) {
    new lunr.QueryParser(queryString, query).parse()
  }

  return this.query(buildFromQueryString)
}

/**
 * A query builder callback provides a query object to be used to express
 * the query to perform on the index.
 *
 * @callback lunr.Index~queryBuilder
 * @param {lunr.Query} query - The query object to build up.
 * @this lunr.Query
 */

/**
 * Performs a query against the index using the yielded lunr.Query object.
 *
 * If performing programmatic queries against the index this method is preferred
 * over lunr.Index#search so as to avoid the additional query parsing overhead.
 *
 * A query object is yielded to the supplied function which should be used to
 * express the query to be run against the index.
 *
 * Note that although this function takes a callback parameter it is _not_ an
 * asynchronous operation, the callback is just yielded a query object to be
 * customized.
 *
 * @param {lunr.Index~queryBuilder} fn - A function that is used to build the query.
 * @returns {lunr.Index~Result[]} The matching documents, sorted by descending score.
 */
lunr.Index.prototype.query = function (fn) {
  // for each query clause
  // * process terms
  // * expand terms from token set
  // * find matching documents and metadata
  // * get document vectors
  // * score documents

  /*
   * matchingDocuments maps a document ref to its collected lunr.MatchData.
   * It is created without a prototype so that refs such as 'constructor'
   * or 'toString' are never mistaken for an already matched document by
   * the `in` check further down.
   */
  var query = new lunr.Query(this.fields),
      matchingDocuments = Object.create(null),
      queryVector = new lunr.Vector

  fn.call(query, query)

  /*
   * The weight only depends on the index parameters and the number of
   * clauses in the query, so it is calculated once rather than for every
   * expanded term.
   *
   * NOTE(review): `(1 - b) + b` cancels out algebraically, so `b` has no
   * real effect here. The expression is kept exactly as it was; confirm
   * the intended formula against the builder before changing it.
   */
  var weight = 1 / (((1 - this.b) + this.b) * (query.clauses.length / this.averageDocumentLength))

  // TODO: need to potentially pass the token through the pipeline
  for (var i = 0; i < query.clauses.length; i++) {
    /*
     * Get a list of matching terms from this query clause. The query
     * clause is used to build a token set, which is then intersected
     * against the index's token set to get a list of terms to look up
     * in the inverted index.
     */
    var clause = query.clauses[i],
        termTokenSet = lunr.TokenSet.fromClause(clause),
        expandedTerms = this.tokenSet.intersect(termTokenSet).toArray()

    for (var j = 0; j < expandedTerms.length; j++) {
      /*
       * For each term calculate the score as the term relates to the
       * query. This score will be used to build a vector space
       * representation of the query.
       *
       * Also need to discover the term's index to insert into the query
       * vector at the right position.
       */
      var expandedTerm = expandedTerms[j],
          posting = this.invertedIndex[expandedTerm],
          termIdf = this.idf(expandedTerm),
          termScore = termIdf * weight / this.k1 + weight,
          termIndex = posting.index

      /*
       * Inserting the found query term, along with its term index
       * into the vector representing the query. It is here that
       * any boosts are applied to the score.
       */
      queryVector.insert(termIndex, termScore * clause.boost)

      for (var k = 0; k < clause.fields.length; k++) {
        /*
         * For each field that this query term is scoped by we need to
         * get all the document refs that have this term in that field.
         *
         * The posting is the entry in the invertedIndex for the matching
         * term from above.
         */
        var field = clause.fields[k],
            fieldPosting = posting[field],
            fieldDocumentRefs = Object.keys(fieldPosting)

        for (var l = 0; l < fieldDocumentRefs.length; l++) {
          /*
           * All metadata for this term/field/document triple
           * are then extracted and collected into an instance
           * of lunr.MatchData ready to be returned in the query
           * results.
           */
          var matchingDocumentRef = fieldDocumentRefs[l],
              documentMetadata, matchData

          /*
           * The IDF for the term/field tuple is also stored
           * in the posting, this is not a document and so is
           * ignored at this step. This does mean that 'idf' is
           * not allowed as a document ref.
           */
          if (matchingDocumentRef === "idf") {
            continue
          }

          documentMetadata = fieldPosting[matchingDocumentRef]
          matchData = new lunr.MatchData (expandedTerm, field, documentMetadata)

          if (matchingDocumentRef in matchingDocuments) {
            matchingDocuments[matchingDocumentRef].combine(matchData)
          } else {
            matchingDocuments[matchingDocumentRef] = matchData
          }

        }
      }
    }
  }

  var matchedRefs = Object.keys(matchingDocuments),
      results = []

  for (var m = 0; m < matchedRefs.length; m++) {
    /*
     * With all the matching documents found they now need
     * to be sorted by their relevance to the query. This
     * is done by retrieving the document's vector representation
     * and then finding its similarity with the query vector
     * that was constructed earlier.
     *
     * This score, along with the document ref and any metadata
     * we collected into a lunr.MatchData instance are stored
     * in the results array ready for returning to the caller.
     */
    var ref = matchedRefs[m],
        documentVector = this.documentVectors[ref],
        documentScore = queryVector.similarity(documentVector)

    results.push({
      ref: ref,
      score: documentScore,
      matchData: matchingDocuments[ref]
    })
  }

  // Highest scoring documents first.
  return results.sort(function (a, b) {
    return b.score - a.score
  })
}

// TODO: this is copied from the builder
// * store the idf for the ALL field right on the posting it self
// this will allow us to get rid of this
/**
 * Calculates an inverse document frequency style weight for a term.
 *
 * The number of documents containing the term is found by summing, over
 * every field posting of the term, the number of document refs in that
 * field posting. A document containing the term in several fields is
 * therefore counted once per field.
 *
 * Entries of the posting that are not field postings (such as the term's
 * `index`) and the `idf` key stored inside a field posting are not
 * documents and are left out of the count.
 *
 * @param {string} term - A term present in the inverted index.
 * @returns {number} (documentCount - n + 0.5) / (n + 0.5), where n is the
 * number of documents counted as containing the term.
 */
lunr.Index.prototype.idf = function (term) {
  var posting = this.invertedIndex[term],
      documentsWithTerm = 0

  for (var fieldName in posting) {
    var fieldPosting = posting[fieldName]

    // Skip anything that is not a map of document refs, e.g. posting.index.
    // Calling Object.keys on a non-object throws on ES5 engines.
    if (fieldPosting === null || typeof fieldPosting !== "object") {
      continue
    }

    var documentRefs = Object.keys(fieldPosting)

    for (var i = 0; i < documentRefs.length; i++) {
      // 'idf' holds the stored IDF for the term/field tuple, not a document.
      if (documentRefs[i] !== "idf") {
        documentsWithTerm += 1
      }
    }
  }

  return (this.documentCount - documentsWithTerm + 0.5) / (documentsWithTerm + 0.5)
}

/**
 * Builds a plain object representation of the index suitable for passing
 * to JSON.stringify.
 *
 * The inverted index is emitted as an array of [term, posting] tuples
 * ordered by term, and the document vectors as an array of
 * [ref, serialized vector] tuples. The document count is not part of the
 * output.
 *
 * The schema for this JSON blob will be described in a
 * separate JSON schema file.
 *
 * @returns {Object}
 */
lunr.Index.prototype.toJSON = function () {
  var sortedTerms = Object.keys(this.invertedIndex).sort(),
      postingTuples = []

  for (var t = 0; t < sortedTerms.length; t++) {
    var term = sortedTerms[t]
    postingTuples.push([term, this.invertedIndex[term]])
  }

  var documentRefs = Object.keys(this.documentVectors),
      vectorTuples = []

  for (var d = 0; d < documentRefs.length; d++) {
    var ref = documentRefs[d]
    vectorTuples.push([ref, this.documentVectors[ref].toJSON()])
  }

  // TODO: include the version of lunr.
  return {
    averageDocumentLength: this.averageDocumentLength,
    b: this.b,
    k1: this.k1,
    fields: this.fields,
    documentVectors: vectorTuples,
    invertedIndex: postingTuples
  }
}

/**
 * Rebuilds a lunr.Index from its serialized form.
 *
 * Every [ref, elements] tuple becomes a lunr.Vector keyed by ref, and every
 * [term, posting] tuple is placed back into the inverted index while the
 * term is also fed, in the order given, to a lunr.TokenSet.Builder whose
 * root becomes the index's token set. The document count is taken from the
 * number of serialized document vectors.
 *
 * @param {Object} serializedIndex - A previously serialized lunr.Index
 * @returns {lunr.Index}
 */
lunr.Index.load = function (serializedIndex) {
  var vectorTuples = serializedIndex.documentVectors,
      postingTuples = serializedIndex.invertedIndex,
      tokenSetBuilder = new lunr.TokenSet.Builder,
      vectorsByRef = {},
      postingsByTerm = {},
      n

  // TODO check the version of lunr.

  for (n = 0; n < vectorTuples.length; n++) {
    vectorsByRef[vectorTuples[n][0]] = new lunr.Vector(vectorTuples[n][1])
  }

  for (n = 0; n < postingTuples.length; n++) {
    var term = postingTuples[n][0]

    tokenSetBuilder.insert(term)
    postingsByTerm[term] = postingTuples[n][1]
  }

  tokenSetBuilder.finish()

  return new lunr.Index({
    b: serializedIndex.b,
    k1: serializedIndex.k1,
    fields: serializedIndex.fields,
    averageDocumentLength: serializedIndex.averageDocumentLength,
    // one document per serialized vector
    documentCount: vectorTuples.length,
    documentVectors: vectorsByRef,
    invertedIndex: postingsByTerm,
    tokenSet: tokenSetBuilder.root
  })
}