summaryrefslogtreecommitdiffstats
path: root/browser/components/newtab/lib/PersonalityProvider/NmfTextTagger.jsm
blob: 639c92b6e4af9ef62d8160058909010b8c648256 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

"use strict";

// This file is loaded into a worker using importScripts, and in tests using
// import.
// EXPORTED_SYMBOLS is declared with var (not let/const) to avoid name
// collision errors: a var binding can be redeclared without throwing.
// NOTE(review): presumably other scripts loaded into the same scope declare
// the same name — confirm against the worker's importScripts list.
// eslint-disable-next-line no-var
var EXPORTED_SYMBOLS = ["NmfTextTagger"];

const NmfTextTagger = class NmfTextTagger {
  /**
   * @param {object} model
   *   The factorization model. `tagTokens` reads three properties from it:
   *   `vocab_idfs` (passed through to `toksToTfIdfVector`), `topic_word`
   *   (an iterable of topic vectors, each indexed by term id), and
   *   `document_topic` (an object mapping each class label to a vector
   *   indexed by latent dimension).
   * @param {Function} toksToTfIdfVector
   *   Called as `toksToTfIdfVector(tokens, model.vocab_idfs)`. Must return an
   *   object whose values are `[termId, tfidf]` pairs.
   */
  constructor(model, toksToTfIdfVector) {
    this.model = model;
    this.toksToTfIdfVector = toksToTfIdfVector;
  }

  /**
   * A multiclass classifier that scores tokenized text against several
   * classes through inference of a nonnegative matrix factorization of TF-IDF
   * vectors and class labels. (Higher is more confident.) All classes get
   * scored, so it is up to the consumer of this data to determine which
   * classes are most valuable.
   *
   * The vector returned by `toksToTfIdfVector` is not modified.
   *
   * @param {Array} tokens
   *   The tokens to classify; forwarded unchanged to `toksToTfIdfVector`.
   * @returns {object}
   *   A map of class labels (the keys of `model.document_topic`) to numeric
   *   scores. Every score is 0 when the TF-IDF vector is empty or its weights
   *   sum to 0.
   */
  tagTokens(tokens) {
    const fv = this.toksToTfIdfVector(tokens, this.model.vocab_idfs);
    const pairs = Object.values(fv);

    // Normalize by the sum of the vector.
    let sum = 0.0;
    for (const [, tfidf] of pairs) {
      sum += tfidf;
    }
    // Build fresh pairs rather than dividing in place, so the caller-owned
    // vector is left intact. A zero sum would turn every weight into NaN
    // (0 / 0), so treat that document as carrying no weight at all.
    const normalized = pairs.map(([termId, tfidf]) => [
      termId,
      sum === 0 ? 0 : tfidf / sum,
    ]);

    // Dot the document with each topic vector so that we can transform it
    // into the latent space.
    const toksInLatentSpace = [];
    for (const topicVect of this.model.topic_word) {
      let fvDotTwv = 0;
      for (const [termId, weight] of normalized) {
        fvDotTwv += weight * topicVect[termId];
      }
      toksInLatentSpace.push(fvDotTwv);
    }

    // Now project toksInLatentSpace back into class space.
    const predictions = {};
    for (const [label, classVect] of Object.entries(
      this.model.document_topic
    )) {
      let score = 0;
      for (let i = 0; i < toksInLatentSpace.length; i++) {
        score += toksInLatentSpace[i] * classVect[i];
      }
      predictions[label] = score;
    }

    return predictions;
  }
};