// Source: builder.js

/*!
 * lunr.Builder
 * Copyright (C) @YEAR Oliver Nightingale
 */

/**
 * lunr.Builder performs indexing on a set of documents and
 * returns instances of lunr.Index ready for querying.
 *
 * All configuration of the index is done via the builder, the
 * fields to index, the document reference, the text processing
 * pipeline and document scoring parameters are all set on the
 * builder before indexing.
 *
 * @constructor
 * @property {string} _ref - Internal reference to the document reference field, defaults to 'id'.
 * @property {string[]} _fields - Internal reference to the document fields to index.
 * @property {object} invertedIndex - The inverted index maps terms to document fields.
 * @property {object} documentTermFrequencies - Maps a document ref to an object of term -> number of occurrences of that term across all of the document's indexed fields.
 * @property {object} documentLengths - Maps a document ref to the total number of terms produced from all of its indexed fields.
 * @property {lunr.tokenizer} tokenizer - Function for splitting strings into tokens for indexing.
 * @property {lunr.Pipeline} pipeline - The pipeline performs text processing on tokens before indexing.
 * @property {number} documentCount - Keeps track of the total number of documents indexed.
 * @property {number} b - A parameter to control field length normalization, setting this to 0 disables normalization, 1 fully normalizes field lengths, the default value is 0.75.
 * @property {number} k1 - A parameter to control how quickly an increase in term frequency results in term frequency saturation, the default value is 2.
 * @property {number} termIndex - A counter incremented for each unique term, used to identify a term's position in the vector space.
 * @property {array} metadataWhitelist - A list of token metadata keys that will be copied into the inverted index for every term occurrence (see lunr.Builder.prototype.add).
 */
lunr.Builder = function () {
  this._ref = "id"
  this._fields = []
  this.invertedIndex = {}
  this.documentTermFrequencies = {}
  this.documentLengths = {}
  this.tokenizer = lunr.tokenizer
  this.pipeline = new lunr.Pipeline
  this.documentCount = 0
  // TODO provide setters for these values to enforce limits.
  this.b = 0.75
  this.k1 = 2
  this.termIndex = 0
  this.metadataWhitelist = []
}

/**
 * Sets the document field used as the document reference. Every document must have this field.
 * The type of this field in the document should be a string, if it is not a string it will be
 * coerced into a string by calling toString.
 *
 * The default ref is 'id'.
 *
 * The ref should _not_ be changed during indexing, it should be set before any documents are
 * added to the index. Changing it during indexing can lead to inconsistent results.
 *
 * @param {string} ref - The name of the reference field in the document.
 */
lunr.Builder.prototype.ref = function (ref) {
  // remember which document property identifies a document from now on
  this._ref = ref
}

/**
 * Adds a field to the list of document fields that will be indexed. Every document being
 * indexed should have this field. Null values for this field in indexed documents will
 * not cause errors but will limit the chance of that document being retrieved by searches.
 *
 * All fields should be added before adding documents to the index. Adding fields after
 * a document has been indexed will have no effect on already indexed documents.
 *
 * @param {string} field - The name of a field to index in all documents.
 */
lunr.Builder.prototype.field = function (field) {
  // fields are processed in insertion order when documents are added
  this._fields.push(field)
}

/**
 * Adds a document to the index.
 *
 * Before adding fields to the index the index should have been fully setup, with the document
 * ref and all fields to index already having been specified.
 *
 * The document must have a field name as specified by the ref (by default this is 'id') and
 * it should have all fields defined for indexing, though null or undefined values will not
 * cause errors.
 *
 * @param {object} doc - The document to add to the index.
 */
lunr.Builder.prototype.add = function (doc) {
  var docRef = doc[this._ref],
      documentTerms = {},
      hasOwn = Object.prototype.hasOwnProperty

  this.documentCount += 1
  this.documentTermFrequencies[docRef] = documentTerms
  this.documentLengths[docRef] = 0

  for (var i = 0; i < this._fields.length; i++) {
    var fieldName = this._fields[i],
        field = doc[fieldName],
        tokens = this.tokenizer(field),
        terms = this.pipeline.run(tokens)

    // store the length of this field for this document
    this.documentLengths[docRef] += terms.length

    // calculate term frequencies for this field
    for (var j = 0; j < terms.length; j++) {
      var term = terms[j]

      // membership is checked with hasOwnProperty rather than an undefined
      // comparison so that terms colliding with Object.prototype members
      // ("constructor", "toString", ...) are counted correctly instead of
      // reading an inherited, truthy value
      if (!hasOwn.call(documentTerms, term)) {
        documentTerms[term] = 0
      }

      documentTerms[term] += 1

      // add to inverted index
      // create an initial posting if one doesn't exist; again guarded with
      // hasOwnProperty so that a term like "constructor" cannot masquerade as
      // an existing posting (which previously skipped posting creation and
      // crashed on the field lookup below)
      if (!hasOwn.call(this.invertedIndex, term)) {
        var posting = { "index": this.termIndex }
        this.termIndex += 1

        for (var k = 0; k < this._fields.length; k++) {
          posting[this._fields[k]] = {}
        }

        this.invertedIndex[term] = posting
      }

      // add an entry for this term/fieldName/docRef to the invertedIndex;
      // docRef also comes from user documents, so guard it the same way
      if (!hasOwn.call(this.invertedIndex[term][fieldName], docRef)) {
        this.invertedIndex[term][fieldName][docRef] = {}
      }

      // store all whitelisted metadata about this token in the
      // inverted index
      for (var l = 0; l < this.metadataWhitelist.length; l++) {
        var metadataKey = this.metadataWhitelist[l],
            metadata = term.metadata[metadataKey] // assumes pipeline output tokens carry a `metadata` object — TODO confirm (lunr.Token)

        if (!hasOwn.call(this.invertedIndex[term][fieldName][docRef], metadataKey)) {
          this.invertedIndex[term][fieldName][docRef][metadataKey] = []
        }

        this.invertedIndex[term][fieldName][docRef][metadataKey].push(metadata)
      }
    }

  }
}

/**
 * Calculates IDF for all terms in the index, storing the result under the
 * "idf" key of each field's posting object.
 *
 * NOTE(review): this.idf is declared with a single `term` parameter, so the
 * `fieldName` argument passed below is ignored and every field of a term
 * receives the same idf value — confirm whether per-field idf was intended.
 *
 * NOTE(review): the for...in loop also visits the posting's numeric "index"
 * property, for which `field["idf"] = idf` is a silent no-op on a primitive.
 * For real fields, writing "idf" into the same object that maps docRefs means
 * later Object.keys() counts in this.idf will see "idf" as if it were a
 * document ref — verify downstream code accounts for this extra key.
 *
 * @private
 */
lunr.Builder.prototype.calculateIDF = function () {
  var terms = Object.keys(this.invertedIndex),
      termsLength = terms.length

  for (var i = 0; i < termsLength; i++) {
    var term = terms[i],
        termFields = this.invertedIndex[term]

    // iterates every enumerable key of the posting: the field maps created in
    // add(), plus the "index" counter (see NOTE above)
    for (var fieldName in termFields) {
      var field = termFields[fieldName],
          idf = this.idf(term, fieldName)

      field["idf"] = idf
    }
  }
}

/**
 * Calculates the average document length for this index.
 *
 * The lengths accumulated per document ref in add() are summed and divided
 * by the number of indexed documents, the result is stored on the builder
 * as averageDocumentLength.
 *
 * @private
 */
lunr.Builder.prototype.calculateAverageDocumentLengths = function () {
  var refs = Object.keys(this.documentLengths),
      accumulatedLength = 0

  for (var i = 0; i < refs.length; i++) {
    accumulatedLength += this.documentLengths[refs[i]]
  }

  this.averageDocumentLength = accumulatedLength / refs.length
}

/**
 * Builds a vector space model of every document using lunr.Vector.
 *
 * Every term in a document contributes a BM25-style score at the term's
 * position (termIndex) in the vector space.
 *
 * @private
 */
lunr.Builder.prototype.createDocumentVectors = function () {
  var documentVectors = {},
      docRefs = Object.keys(this.documentTermFrequencies),
      docRefsLength = docRefs.length

  for (var i = 0; i < docRefsLength; i++) {
    var docRef = docRefs[i],
        documentLength = this.documentLengths[docRef],
        documentVector = new lunr.Vector,
        termFrequencies = this.documentTermFrequencies[docRef],
        terms = Object.keys(termFrequencies),
        termsLength = terms.length

    for (var j = 0; j < termsLength; j++) {
      var term = terms[j],
          termFrequency = termFrequencies[term],
          termIndex = this.invertedIndex[term].index

      // BM25 field length normalisation: b = 0 disables normalisation, b = 1
      // fully normalises by this document's length relative to the average,
      // matching the documented contract for `b`. (The previous grouping,
      // ((1 - b) + b) * ratio, collapsed to just the ratio, so `b` had no
      // effect at all.)
      var weight = termFrequency / ((1 - this.b) + this.b * (documentLength / this.averageDocumentLength)),
          // BM25 term frequency saturation: dividing by (k1 + weight) makes
          // repeated occurrences yield diminishing returns, as documented for
          // `k1`. (The previous expression parsed as (idf * weight / k1) +
          // weight, which grows linearly with no saturation.)
          score = this.idf(term) * weight / (this.k1 + weight)

      documentVector.insert(termIndex, score)
    }

    documentVectors[docRef] = documentVector
  }

  this.documentVectors = documentVectors
}

/**
 * Creates a token set of all tokens in the index using lunr.TokenSet.
 *
 * The terms of the inverted index are sorted before being handed to
 * lunr.TokenSet.fromArray, which requires its input in sorted order.
 *
 * @private
 */
lunr.Builder.prototype.createTokenSet = function () {
  var sortedTerms = Object.keys(this.invertedIndex).sort()
  this.tokenSet = lunr.TokenSet.fromArray(sortedTerms)
}

/**
 * Builds the index, creating an instance of lunr.Index.
 *
 * This completes the indexing process and should only be called
 * once all documents have been added to the index.
 *
 * @private
 * @returns {lunr.Index}
 */
lunr.Builder.prototype.build = function () {
  // finalise all derived data before snapshotting it into the index
  this.calculateIDF()
  this.calculateAverageDocumentLengths()
  this.createDocumentVectors()
  this.createTokenSet()

  var attrs = {
    invertedIndex: this.invertedIndex,
    documentVectors: this.documentVectors,
    tokenSet: this.tokenSet,
    averageDocumentLength: this.averageDocumentLength,
    documentCount: this.documentCount,
    fields: this._fields,
    b: this.b,
    k1: this.k1
  }

  return new lunr.Index(attrs)
}

/**
 * Calculates an inverse-document-frequency style ratio for a term from the
 * per-field posting sizes in this.invertedIndex.
 *
 * NOTE(review): classic BM25 idf wraps this ratio in a log; presumably the
 * bare ratio is intentional here — confirm against lunr.Index's scoring.
 *
 * @private
 * @param {string} term - The term to calculate idf for.
 * @returns {number} (documentCount - documentsWithTerm + 0.5) / (documentsWithTerm + 0.5)
 */
lunr.Builder.prototype.idf = function (term) {
  var posting = this.invertedIndex[term],
      documentsWithTerm = 0

  // NOTE(review): summing key counts of every field double-counts a document
  // that contains the term in more than one field, and (after calculateIDF
  // has run) also counts the stored "idf" key as a document, so
  // documentsWithTerm can exceed documentCount and drive the result negative
  // — verify this is intended. The for...in also visits the posting's
  // numeric "index" property (Object.keys on a number yields []).
  for (var fieldName in posting) {
    documentsWithTerm += Object.keys(posting[fieldName]).length
  }

  return (this.documentCount - documentsWithTerm + 0.5) / (documentsWithTerm + 0.5)
}