Diffstat (limited to 'browser/components/newtab/lib/PersonalityProvider')
-rw-r--r--  browser/components/newtab/lib/PersonalityProvider/NaiveBayesTextTagger.jsm | 67
-rw-r--r--  browser/components/newtab/lib/PersonalityProvider/NmfTextTagger.jsm | 65
-rw-r--r--  browser/components/newtab/lib/PersonalityProvider/PersonalityProvider.jsm | 282
-rw-r--r--  browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorker.js | 44
-rw-r--r--  browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorkerClass.jsm | 311
-rw-r--r--  browser/components/newtab/lib/PersonalityProvider/RecipeExecutor.jsm | 1126
-rw-r--r--  browser/components/newtab/lib/PersonalityProvider/Tokenize.jsm | 89
7 files changed, 1984 insertions, 0 deletions
diff --git a/browser/components/newtab/lib/PersonalityProvider/NaiveBayesTextTagger.jsm b/browser/components/newtab/lib/PersonalityProvider/NaiveBayesTextTagger.jsm
new file mode 100644
index 0000000000..cc625076ba
--- /dev/null
+++ b/browser/components/newtab/lib/PersonalityProvider/NaiveBayesTextTagger.jsm
@@ -0,0 +1,67 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+"use strict";
+
+// We load this into a worker using importScripts, and in tests using import.
+// We use var to avoid name collision errors.
+// eslint-disable-next-line no-var
+var EXPORTED_SYMBOLS = ["NaiveBayesTextTagger"];
+
+const NaiveBayesTextTagger = class NaiveBayesTextTagger {
+ constructor(model, toksToTfIdfVector) {
+ this.model = model;
+ this.toksToTfIdfVector = toksToTfIdfVector;
+ }
+
+ /**
+ * Determines if the tokenized text belongs to a class according to a binary naive Bayes
+ * classifier. Returns an object containing the class label ("label"), and
+ * the log probability ("logProb") that the text belongs to that class. If
+ * the positive class is more likely, then "label" is the positive class
+ * label. If the negative class is matched, then "label" is set to null.
+ */
+ tagTokens(tokens) {
+ let fv = this.toksToTfIdfVector(tokens, this.model.vocab_idfs);
+
+ let bestLogProb = null;
+ let bestClassId = -1;
+ let bestClassLabel = null;
+ let logSumExp = 0.0; // will be P(x). Used to create a proper probability
+ for (let classId = 0; classId < this.model.classes.length; classId++) {
+ let classModel = this.model.classes[classId];
+ let classLogProb = classModel.log_prior;
+
+ // dot fv with the class model
+ for (let pair of Object.values(fv)) {
+ let [termId, tfidf] = pair;
+ classLogProb += tfidf * classModel.feature_log_probs[termId];
+ }
+
+ if (bestLogProb === null || classLogProb > bestLogProb) {
+ bestLogProb = classLogProb;
+ bestClassId = classId;
+ }
+ logSumExp += Math.exp(classLogProb);
+ }
+
+ // now normalize the probability by dividing by P(x)
+ logSumExp = Math.log(logSumExp);
+ bestLogProb -= logSumExp;
+ if (bestClassId === this.model.positive_class_id) {
+ bestClassLabel = this.model.positive_class_label;
+ } else {
+ bestClassLabel = null;
+ }
+
+ let confident =
+ bestClassId === this.model.positive_class_id &&
+ bestLogProb > this.model.positive_class_threshold_log_prob;
+ return {
+ label: bestClassLabel,
+ logProb: bestLogProb,
+ confident,
+ };
+ }
+};
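+
+// Illustrative usage, not part of the module: a sketch of driving the tagger with
+// a hypothetical `model` object shaped like the Remote Settings attachment this
+// class reads (classes, vocab_idfs, positive_class_id, positive_class_label,
+// positive_class_threshold_log_prob), plus tokenize/toksToTfIdfVector from Tokenize.jsm:
+//
+//   const tagger = new NaiveBayesTextTagger(model, toksToTfIdfVector);
+//   const { label, logProb, confident } = tagger.tagTokens(tokenize("some page text"));
+//   // label is model.positive_class_label or null; logProb is normalized by P(x).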
diff --git a/browser/components/newtab/lib/PersonalityProvider/NmfTextTagger.jsm b/browser/components/newtab/lib/PersonalityProvider/NmfTextTagger.jsm
new file mode 100644
index 0000000000..639c92b6e4
--- /dev/null
+++ b/browser/components/newtab/lib/PersonalityProvider/NmfTextTagger.jsm
@@ -0,0 +1,65 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+"use strict";
+
+// We load this into a worker using importScripts, and in tests using import.
+// We use var to avoid name collision errors.
+// eslint-disable-next-line no-var
+var EXPORTED_SYMBOLS = ["NmfTextTagger"];
+
+const NmfTextTagger = class NmfTextTagger {
+ constructor(model, toksToTfIdfVector) {
+ this.model = model;
+ this.toksToTfIdfVector = toksToTfIdfVector;
+ }
+
+ /**
+ * A multiclass classifier that scores tokenized text against several classes through
+ * inference of a nonnegative matrix factorization of TF-IDF vectors and
+ * class labels. Returns a map of class labels as string keys to scores.
+ * (Higher is more confident.) All classes get scored, so it is up to the
+ * consumer of this data to determine which classes are most valuable.
+ */
+ tagTokens(tokens) {
+ let fv = this.toksToTfIdfVector(tokens, this.model.vocab_idfs);
+ let fve = Object.values(fv);
+
+ // normalize by the sum of the vector
+ let sum = 0.0;
+ for (let pair of fve) {
+ // eslint-disable-next-line prefer-destructuring
+ sum += pair[1];
+ }
+ for (let i = 0; i < fve.length; i++) {
+ // eslint-disable-next-line prefer-destructuring
+ fve[i][1] /= sum;
+ }
+
+ // dot the document with each topic vector so that we can transform it into
+ // the latent space
+ let toksInLatentSpace = [];
+ for (let topicVect of this.model.topic_word) {
+ let fvDotTwv = 0;
+ // dot fv with each topic word vector
+ for (let pair of fve) {
+ let [termId, tfidf] = pair;
+ fvDotTwv += tfidf * topicVect[termId];
+ }
+ toksInLatentSpace.push(fvDotTwv);
+ }
+
+ // now project toksInLatentSpace back into class space
+ let predictions = {};
+ Object.keys(this.model.document_topic).forEach(topic => {
+ let score = 0;
+ for (let i = 0; i < toksInLatentSpace.length; i++) {
+ score += toksInLatentSpace[i] * this.model.document_topic[topic][i];
+ }
+ predictions[topic] = score;
+ });
+
+ return predictions;
+ }
+};
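+
+// Illustrative usage, not part of the module: a sketch assuming a hypothetical NMF
+// `model` carrying the vocab_idfs, topic_word, and document_topic fields this class
+// reads, plus tokenize/toksToTfIdfVector from Tokenize.jsm:
+//
+//   const tagger = new NmfTextTagger(model, toksToTfIdfVector);
+//   const scores = tagger.tagTokens(tokenize("some page text"));
+//   // scores maps every class label to a number; higher means more confident.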
diff --git a/browser/components/newtab/lib/PersonalityProvider/PersonalityProvider.jsm b/browser/components/newtab/lib/PersonalityProvider/PersonalityProvider.jsm
new file mode 100644
index 0000000000..c1f54408f2
--- /dev/null
+++ b/browser/components/newtab/lib/PersonalityProvider/PersonalityProvider.jsm
@@ -0,0 +1,282 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+"use strict";
+
+const lazy = {};
+
+ChromeUtils.defineESModuleGetters(lazy, {
+ NewTabUtils: "resource://gre/modules/NewTabUtils.sys.mjs",
+ RemoteSettings: "resource://services-settings/remote-settings.sys.mjs",
+ Utils: "resource://services-settings/Utils.sys.mjs",
+});
+
+const { BasePromiseWorker } = ChromeUtils.importESModule(
+ "resource://gre/modules/PromiseWorker.sys.mjs"
+);
+
+const RECIPE_NAME = "personality-provider-recipe";
+const MODELS_NAME = "personality-provider-models";
+
+class PersonalityProvider {
+ constructor(modelKeys) {
+ this.modelKeys = modelKeys;
+ this.onSync = this.onSync.bind(this);
+ this.setup();
+ }
+
+ setScores(scores) {
+ this.scores = scores || {};
+ this.interestConfig = this.scores.interestConfig;
+ this.interestVector = this.scores.interestVector;
+ }
+
+ get personalityProviderWorker() {
+ if (this._personalityProviderWorker) {
+ return this._personalityProviderWorker;
+ }
+
+ this._personalityProviderWorker = new BasePromiseWorker(
+ "resource://activity-stream/lib/PersonalityProvider/PersonalityProviderWorker.js"
+ );
+
+ return this._personalityProviderWorker;
+ }
+
+ get baseAttachmentsURL() {
+ // Returning a promise, so we can have an async getter.
+ return this._getBaseAttachmentsURL();
+ }
+
+ async _getBaseAttachmentsURL() {
+ if (this._baseAttachmentsURL) {
+ return this._baseAttachmentsURL;
+ }
+ const server = lazy.Utils.SERVER_URL;
+ const serverInfo = await (
+ await fetch(`${server}/`, {
+ credentials: "omit",
+ })
+ ).json();
+ const {
+ capabilities: {
+ attachments: { base_url },
+ },
+ } = serverInfo;
+ this._baseAttachmentsURL = base_url;
+ return this._baseAttachmentsURL;
+ }
+
+ setup() {
+ this.setupSyncAttachment(RECIPE_NAME);
+ this.setupSyncAttachment(MODELS_NAME);
+ }
+
+ teardown() {
+ this.teardownSyncAttachment(RECIPE_NAME);
+ this.teardownSyncAttachment(MODELS_NAME);
+ if (this._personalityProviderWorker) {
+ this._personalityProviderWorker.terminate();
+ }
+ }
+
+ setupSyncAttachment(collection) {
+ lazy.RemoteSettings(collection).on("sync", this.onSync);
+ }
+
+ teardownSyncAttachment(collection) {
+ lazy.RemoteSettings(collection).off("sync", this.onSync);
+ }
+
+ onSync(event) {
+ this.personalityProviderWorker.post("onSync", [event]);
+ }
+
+ /**
+ * Gets contents of the attachment if it already exists on file,
+ * and if not attempts to download it.
+ */
+ getAttachment(record) {
+ return this.personalityProviderWorker.post("getAttachment", [record]);
+ }
+
+ /**
+ * Returns a Recipe from remote settings to be consumed by a RecipeExecutor.
+ * A Recipe is a set of instructions that a RecipeExecutor runs over an item.
+ */
+ async getRecipe() {
+ if (!this.recipes || !this.recipes.length) {
+ const result = await lazy.RemoteSettings(RECIPE_NAME).get();
+ this.recipes = await Promise.all(
+ result.map(async record => ({
+ ...(await this.getAttachment(record)),
+ recordKey: record.key,
+ }))
+ );
+ }
+ return this.recipes[0];
+ }
+
+ /**
+ * Grabs a slice of browsing history for building an interest vector
+ */
+ async fetchHistory(columns, beginTimeSecs, endTimeSecs) {
+ let sql = `SELECT url, title, visit_count, frecency, last_visit_date, description
+ FROM moz_places
+ WHERE last_visit_date >= ${beginTimeSecs * 1000000}
+ AND last_visit_date < ${endTimeSecs * 1000000}`;
+ columns.forEach(requiredColumn => {
+ sql += ` AND IFNULL(${requiredColumn}, '') <> ''`;
+ });
+ sql += " LIMIT 30000";
+
+ const { activityStreamProvider } = lazy.NewTabUtils;
+ const history = await activityStreamProvider.executePlacesQuery(sql, {
+ columns,
+ params: {},
+ });
+
+ return history;
+ }
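+
+ // For illustration (not part of the module): with columns = ["title", "description"],
+ // the query built above looks like
+ //   SELECT url, title, visit_count, frecency, last_visit_date, description
+ //   FROM moz_places
+ //   WHERE last_visit_date >= <beginTimeSecs * 1000000>
+ //     AND last_visit_date < <endTimeSecs * 1000000>
+ //     AND IFNULL(title, '') <> ''
+ //     AND IFNULL(description, '') <> '' LIMIT 30000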
+
+ /**
+ * Handles setup and metrics of history fetch.
+ */
+ async getHistory() {
+ // Check the config before computing the time window; the window length
+ // comes from this.interestConfig.
+ if (
+ !this.interestConfig ||
+ !this.interestConfig.history_required_fields ||
+ !this.interestConfig.history_required_fields.length
+ ) {
+ return [];
+ }
+ let endTimeSecs = new Date().getTime() / 1000;
+ let beginTimeSecs = endTimeSecs - this.interestConfig.history_limit_secs;
+ let history = await this.fetchHistory(
+ this.interestConfig.history_required_fields,
+ beginTimeSecs,
+ endTimeSecs
+ );
+
+ return history;
+ }
+
+ async setBaseAttachmentsURL() {
+ await this.personalityProviderWorker.post("setBaseAttachmentsURL", [
+ await this.baseAttachmentsURL,
+ ]);
+ }
+
+ async setInterestConfig() {
+ this.interestConfig = this.interestConfig || (await this.getRecipe());
+ await this.personalityProviderWorker.post("setInterestConfig", [
+ this.interestConfig,
+ ]);
+ }
+
+ async setInterestVector() {
+ await this.personalityProviderWorker.post("setInterestVector", [
+ this.interestVector,
+ ]);
+ }
+
+ async fetchModels() {
+ const models = await lazy.RemoteSettings(MODELS_NAME).get();
+ return this.personalityProviderWorker.post("fetchModels", [models]);
+ }
+
+ async generateTaggers() {
+ await this.personalityProviderWorker.post("generateTaggers", [
+ this.modelKeys,
+ ]);
+ }
+
+ async generateRecipeExecutor() {
+ await this.personalityProviderWorker.post("generateRecipeExecutor");
+ }
+
+ async createInterestVector() {
+ const history = await this.getHistory();
+
+ const interestVectorResult = await this.personalityProviderWorker.post(
+ "createInterestVector",
+ [history]
+ );
+
+ return interestVectorResult;
+ }
+
+ async init(callback) {
+ await this.setBaseAttachmentsURL();
+ await this.setInterestConfig();
+ if (!this.interestConfig) {
+ return;
+ }
+
+ // We always generate a recipe executor, no cache used here.
+ // This is because the result of this is an object with
+ // functions (taggers) so storing it in cache is not possible.
+ // Thus we cannot use it to rehydrate anything.
+ const fetchModelsResult = await this.fetchModels();
+ // If fetching models failed, we have nothing to tag with, so return.
+ if (!fetchModelsResult.ok) {
+ return;
+ }
+ await this.generateTaggers();
+ await this.generateRecipeExecutor();
+
+ // If we don't have a cached vector, create a new one.
+ if (!this.interestVector) {
+ const interestVectorResult = await this.createInterestVector();
+ // If creating the interest vector failed, return.
+ if (!interestVectorResult.ok) {
+ return;
+ }
+ this.interestVector = interestVectorResult.interestVector;
+ }
+
+ // This happens outside the createInterestVector call above,
+ // because create can be skipped if rehydrating from cache.
+ // In that case, the interest vector is provided and not created, so we just set it.
+ await this.setInterestVector();
+
+ this.initialized = true;
+ if (callback) {
+ callback();
+ }
+ }
+
+ async calculateItemRelevanceScore(pocketItem) {
+ if (!this.initialized) {
+ return pocketItem.item_score || 1;
+ }
+ const itemRelevanceScore = await this.personalityProviderWorker.post(
+ "calculateItemRelevanceScore",
+ [pocketItem]
+ );
+ if (!itemRelevanceScore) {
+ return -1;
+ }
+ const { scorableItem, rankingVector } = itemRelevanceScore;
+ // Put the results on the item for debugging purposes.
+ pocketItem.scorableItem = scorableItem;
+ pocketItem.rankingVector = rankingVector;
+ return rankingVector.score;
+ }
+
+ /**
+ * Returns an object holding the personalization scores of this provider instance.
+ */
+ getScores() {
+ return {
+ // We cannot return taggers here.
+ // What we return here goes into the persistent cache, and taggers have functions on them.
+ // If we attempted to save taggers into the persistent cache, they would be stored to disk,
+ // and the next time we loaded them, we would start throwing "function is not defined" errors.
+ interestConfig: this.interestConfig,
+ interestVector: this.interestVector,
+ };
+ }
+}
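+
+// Illustrative usage, not part of the module: a sketch of how a consumer might drive
+// the provider. The model keys, cachedScores, and pocketItem are hypothetical placeholders.
+//
+//   const provider = new PersonalityProvider(["nb-model-example", "nmf-model-example"]);
+//   provider.setScores(cachedScores || {});
+//   await provider.init();
+//   const score = await provider.calculateItemRelevanceScore(pocketItem);
+//   const scoresToCache = provider.getScores();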
+
+const EXPORTED_SYMBOLS = ["PersonalityProvider"];
diff --git a/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorker.js b/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorker.js
new file mode 100644
index 0000000000..3ed118857e
--- /dev/null
+++ b/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorker.js
@@ -0,0 +1,44 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* eslint-env mozilla/chrome-worker */
+
+"use strict";
+
+// Order of these are important.
+/* import-globals-from /toolkit/components/workerloader/require.js */
+/* import-globals-from Tokenize.jsm */
+/* import-globals-from NaiveBayesTextTagger.jsm */
+/* import-globals-from NmfTextTagger.jsm */
+/* import-globals-from RecipeExecutor.jsm */
+/* import-globals-from PersonalityProviderWorkerClass.jsm */
+importScripts(
+ "resource://gre/modules/workers/require.js",
+ "resource://activity-stream/lib/PersonalityProvider/Tokenize.jsm",
+ "resource://activity-stream/lib/PersonalityProvider/NaiveBayesTextTagger.jsm",
+ "resource://activity-stream/lib/PersonalityProvider/NmfTextTagger.jsm",
+ "resource://activity-stream/lib/PersonalityProvider/RecipeExecutor.jsm",
+ "resource://activity-stream/lib/PersonalityProvider/PersonalityProviderWorkerClass.jsm"
+);
+
+const PromiseWorker = require("resource://gre/modules/workers/PromiseWorker.js");
+
+const personalityProviderWorker = new PersonalityProviderWorker();
+
+// This is boilerplate worker code that connects it to the main thread PromiseWorker.
+const worker = new PromiseWorker.AbstractWorker();
+worker.dispatch = function (method, args = []) {
+ return personalityProviderWorker[method](...args);
+};
+worker.postMessage = function (message, ...transfers) {
+ self.postMessage(message, ...transfers);
+};
+worker.close = function () {
+ self.close();
+};
+
+self.addEventListener("message", msg => worker.handleMessage(msg));
+self.addEventListener("unhandledrejection", function (error) {
+ throw error.reason;
+});
diff --git a/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorkerClass.jsm b/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorkerClass.jsm
new file mode 100644
index 0000000000..e761f827d2
--- /dev/null
+++ b/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorkerClass.jsm
@@ -0,0 +1,311 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+"use strict";
+
+// PersonalityProviderWorker.js imports the following scripts before this.
+/* import-globals-from Tokenize.jsm */
+/* import-globals-from NaiveBayesTextTagger.jsm */
+/* import-globals-from NmfTextTagger.jsm */
+/* import-globals-from RecipeExecutor.jsm */
+
+// We load this into a worker using importScripts, and in tests using import.
+// We use var to avoid name collision errors.
+// eslint-disable-next-line no-var
+var EXPORTED_SYMBOLS = ["PersonalityProviderWorker"];
+
+// A helper function to create a hash out of a file.
+async function _getFileHash(filepath) {
+ const data = await IOUtils.read(filepath);
+ // data is a Uint8Array of the file's bytes.
+ const digest = await crypto.subtle.digest("SHA-256", data);
+ const uint8 = new Uint8Array(digest);
+ // return the two-digit hexadecimal code for a byte
+ const toHex = b => b.toString(16).padStart(2, "0");
+ return Array.from(uint8, toHex).join("");
+}
+
+/**
+ * The V2 provider builds and ranks an interest profile (also called an “interest vector”)
+ * from the browsing history. This allows Firefox to classify pages into topics
+ * by examining the text found on the page: the history item's content, title, and description.
+ */
+const PersonalityProviderWorker = class PersonalityProviderWorker {
+ async getPersonalityProviderDir() {
+ const personalityProviderDir = PathUtils.join(
+ await PathUtils.getLocalProfileDir(),
+ "personality-provider"
+ );
+
+ // Cache this so we don't need to await again.
+ this.getPersonalityProviderDir = () =>
+ Promise.resolve(personalityProviderDir);
+ return personalityProviderDir;
+ }
+
+ setBaseAttachmentsURL(url) {
+ this.baseAttachmentsURL = url;
+ }
+
+ setInterestConfig(interestConfig) {
+ this.interestConfig = interestConfig;
+ }
+
+ setInterestVector(interestVector) {
+ this.interestVector = interestVector;
+ }
+
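+ // A sketch of the RemoteSettings "sync" event shape consumed below (illustrative,
+ // based on the destructuring in onSync):
+ //   { data: { created: [...], updated: [{ old, new }, ...], deleted: [...] } }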
+ onSync(event) {
+ const {
+ data: { created, updated, deleted },
+ } = event;
+ // Remove every removed attachment.
+ const toRemove = deleted.concat(updated.map(u => u.old));
+ toRemove.forEach(record => this.deleteAttachment(record));
+
+ // Download every new/updated attachment.
+ const toDownload = created.concat(updated.map(u => u.new));
+ // maybeDownloadAttachment is async but we don't care inside onSync.
+ toDownload.forEach(record => this.maybeDownloadAttachment(record));
+ }
+
+ /**
+ * Attempts to download the attachment, but only if it doesn't already exist.
+ */
+ async maybeDownloadAttachment(record, retries = 3) {
+ const {
+ attachment: { filename, hash, size },
+ } = record;
+ await IOUtils.makeDirectory(await this.getPersonalityProviderDir());
+ const localFilePath = PathUtils.join(
+ await this.getPersonalityProviderDir(),
+ filename
+ );
+
+ let retry = 0;
+ while (
+ retry++ < retries &&
+ // Note: the exists() check has a perf cost and may not always be needed.
+ (!(await IOUtils.exists(localFilePath)) ||
+ (await IOUtils.stat(localFilePath)).size !== size ||
+ (await _getFileHash(localFilePath)) !== hash)
+ ) {
+ await this._downloadAttachment(record);
+ }
+ }
+
+ /**
+ * Downloads the attachment to disk, assuming the directory already exists.
+ * Any existing file with the same filename is clobbered.
+ */
+ async _downloadAttachment(record) {
+ const {
+ attachment: { location, filename },
+ } = record;
+ const remoteFilePath = this.baseAttachmentsURL + location;
+ const localFilePath = PathUtils.join(
+ await this.getPersonalityProviderDir(),
+ filename
+ );
+
+ const xhr = new XMLHttpRequest();
+ // Set false here for a synchronous request, because we're in a worker.
+ xhr.open("GET", remoteFilePath, false);
+ xhr.setRequestHeader("Accept-Encoding", "gzip");
+ xhr.responseType = "arraybuffer";
+ xhr.withCredentials = false;
+ xhr.send(null);
+
+ if (xhr.status !== 200) {
+ console.error(`Failed to fetch ${remoteFilePath}: ${xhr.statusText}`);
+ return;
+ }
+
+ const buffer = xhr.response;
+ const bytes = new Uint8Array(buffer);
+
+ await IOUtils.write(localFilePath, bytes, {
+ tmpPath: `${localFilePath}.tmp`,
+ });
+ }
+
+ async deleteAttachment(record) {
+ const {
+ attachment: { filename },
+ } = record;
+ await IOUtils.makeDirectory(await this.getPersonalityProviderDir());
+ const path = PathUtils.join(
+ await this.getPersonalityProviderDir(),
+ filename
+ );
+
+ await IOUtils.remove(path, { ignoreAbsent: true });
+ // Clean up the directory if it is empty; do nothing if it is not empty.
+ try {
+ await IOUtils.remove(await this.getPersonalityProviderDir(), {
+ ignoreAbsent: true,
+ });
+ } catch (e) {
+ // This is likely because the directory is not empty, so we don't care.
+ }
+ }
+
+ /**
+ * Gets contents of the attachment if it already exists on file,
+ * and if not attempts to download it.
+ */
+ async getAttachment(record) {
+ const {
+ attachment: { filename },
+ } = record;
+ const filepath = PathUtils.join(
+ await this.getPersonalityProviderDir(),
+ filename
+ );
+
+ try {
+ await this.maybeDownloadAttachment(record);
+ return await IOUtils.readJSON(filepath);
+ } catch (error) {
+ console.error(`Failed to load ${filepath}: ${error.message}`);
+ }
+ return {};
+ }
+
+ async fetchModels(models) {
+ this.models = await Promise.all(
+ models.map(async record => ({
+ ...(await this.getAttachment(record)),
+ recordKey: record.key,
+ }))
+ );
+ if (!this.models.length) {
+ return {
+ ok: false,
+ };
+ }
+ return {
+ ok: true,
+ };
+ }
+
+ generateTaggers(modelKeys) {
+ if (!this.taggers) {
+ let nbTaggers = [];
+ let nmfTaggers = {};
+
+ for (let model of this.models) {
+ if (!modelKeys.includes(model.recordKey)) {
+ continue;
+ }
+ if (model.model_type === "nb") {
+ nbTaggers.push(new NaiveBayesTextTagger(model, toksToTfIdfVector));
+ } else if (model.model_type === "nmf") {
+ nmfTaggers[model.parent_tag] = new NmfTextTagger(
+ model,
+ toksToTfIdfVector
+ );
+ }
+ }
+ this.taggers = { nbTaggers, nmfTaggers };
+ }
+ }
+
+ /**
+ * Sets and generates a Recipe Executor.
+ * A Recipe Executor is a set of actions that can be consumed by a Recipe.
+ * The Recipe determines the order and specifics of how the actions are called.
+ */
+ generateRecipeExecutor() {
+ const recipeExecutor = new RecipeExecutor(
+ this.taggers.nbTaggers,
+ this.taggers.nmfTaggers,
+ tokenize
+ );
+ this.recipeExecutor = recipeExecutor;
+ }
+
+ /**
+ * Examines the user's browse history and returns an interest vector that
+ * describes the topics the user frequently browses.
+ */
+ createInterestVector(history) {
+ let interestVector = {};
+
+ for (let historyRec of history) {
+ let ivItem = this.recipeExecutor.executeRecipe(
+ historyRec,
+ this.interestConfig.history_item_builder
+ );
+ if (ivItem === null) {
+ continue;
+ }
+ interestVector = this.recipeExecutor.executeCombinerRecipe(
+ interestVector,
+ ivItem,
+ this.interestConfig.interest_combiner
+ );
+ if (interestVector === null) {
+ return null;
+ }
+ }
+
+ const finalResult = this.recipeExecutor.executeRecipe(
+ interestVector,
+ this.interestConfig.interest_finalizer
+ );
+
+ return {
+ ok: true,
+ interestVector: finalResult,
+ };
+ }
+
+ /**
+ * Calculates a score of a Pocket item when compared to the user's interest
+ * vector. Returns the score. Higher scores are better. Assumes this.interestVector
+ * is populated.
+ */
+ calculateItemRelevanceScore(pocketItem) {
+ const { personalization_models } = pocketItem;
+ let scorableItem;
+
+ // If the server provides some models, we can just use them,
+ // and skip generating them.
+ if (personalization_models && Object.keys(personalization_models).length) {
+ scorableItem = {
+ id: pocketItem.id,
+ item_tags: personalization_models,
+ item_score: pocketItem.item_score,
+ item_sort_id: 1,
+ };
+ } else {
+ scorableItem = this.recipeExecutor.executeRecipe(
+ pocketItem,
+ this.interestConfig.item_to_rank_builder
+ );
+ if (scorableItem === null) {
+ return null;
+ }
+ }
+
+ // We're doing a deep copy on an object.
+ let rankingVector = JSON.parse(JSON.stringify(this.interestVector));
+
+ Object.keys(scorableItem).forEach(key => {
+ rankingVector[key] = scorableItem[key];
+ });
+
+ rankingVector = this.recipeExecutor.executeRecipe(
+ rankingVector,
+ this.interestConfig.item_ranker
+ );
+
+ if (rankingVector === null) {
+ return null;
+ }
+
+ return { scorableItem, rankingVector };
+ }
+};
diff --git a/browser/components/newtab/lib/PersonalityProvider/RecipeExecutor.jsm b/browser/components/newtab/lib/PersonalityProvider/RecipeExecutor.jsm
new file mode 100644
index 0000000000..9dbf8b802d
--- /dev/null
+++ b/browser/components/newtab/lib/PersonalityProvider/RecipeExecutor.jsm
@@ -0,0 +1,1126 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+"use strict";
+
+// We load this into a worker using importScripts, and in tests using import.
+// We use var to avoid name collision errors.
+// eslint-disable-next-line no-var
+var EXPORTED_SYMBOLS = ["RecipeExecutor"];
+
+/**
+ * RecipeExecutor is the core feature engineering pipeline for the in-browser
+ * personalization work. These pipelines are called "recipes". A recipe is an
+ * array of objects that define a "step" in the recipe. A step is simply an
+ * object with a field "function" that specifies what is being done in the step
+ * along with other fields that are semantically defined for that step.
+ *
+ * There are two types of recipes: "builder" recipes and "combiner" recipes. Builder
+ * recipes mutate an object until it matches some set of criteria. Combiner
+ * recipes take two objects (a "left" and a "right"), and specify the steps
+ * to merge the right object into the left object.
+ *
+ * A short nonsense example recipe is:
+ * [ {"function": "get_url_domain", "path_length": 1, "field": "url", "dest": "url_domain"},
+ * {"function": "nb_tag", "fields": ["title", "description"]},
+ * {"function": "conditionally_nmf_tag", "fields": ["title", "description"]} ]
+ *
+ * Recipes are sandboxed by the fact that the step functions must be explicitly
+ * allowed. Functions allowed for builder recipes are specified in
+ * RecipeExecutor.ITEM_BUILDER_REGISTRY, while allowed combiner functions are
+ * specified in RecipeExecutor.ITEM_COMBINER_REGISTRY.
+ */
+const RecipeExecutor = class RecipeExecutor {
+ constructor(nbTaggers, nmfTaggers, tokenize) {
+ this.ITEM_BUILDER_REGISTRY = {
+ nb_tag: this.naiveBayesTag,
+ conditionally_nmf_tag: this.conditionallyNmfTag,
+ accept_item_by_field_value: this.acceptItemByFieldValue,
+ tokenize_url: this.tokenizeUrl,
+ get_url_domain: this.getUrlDomain,
+ tokenize_field: this.tokenizeField,
+ copy_value: this.copyValue,
+ keep_top_k: this.keepTopK,
+ scalar_multiply: this.scalarMultiply,
+ elementwise_multiply: this.elementwiseMultiply,
+ vector_multiply: this.vectorMultiply,
+ scalar_add: this.scalarAdd,
+ vector_add: this.vectorAdd,
+ make_boolean: this.makeBoolean,
+ allow_fields: this.allowFields,
+ filter_by_value: this.filterByValue,
+ l2_normalize: this.l2Normalize,
+ prob_normalize: this.probNormalize,
+ set_default: this.setDefault,
+ lookup_value: this.lookupValue,
+ copy_to_map: this.copyToMap,
+ scalar_multiply_tag: this.scalarMultiplyTag,
+ apply_softmax_tags: this.applySoftmaxTags,
+ };
+ this.ITEM_COMBINER_REGISTRY = {
+ combiner_add: this.combinerAdd,
+ combiner_max: this.combinerMax,
+ combiner_collect_values: this.combinerCollectValues,
+ };
+ this.nbTaggers = nbTaggers;
+ this.nmfTaggers = nmfTaggers;
+ this.tokenize = tokenize;
+ }
+
+ /**
+ * Determines the type of a field. Valid types are:
+ * string
+ * number
+ * array
+ * map (strings to anything)
+ */
+ _typeOf(data) {
+ let t = typeof data;
+ if (t === "object") {
+ if (data === null) {
+ return "null";
+ }
+ if (Array.isArray(data)) {
+ return "array";
+ }
+ return "map";
+ }
+ return t;
+ }
+
+ /**
+ * Returns a scalar, either because it was a constant, or by
+ * looking it up from the item. Allows for a default value if the lookup
+ * fails.
+ */
+ _lookupScalar(item, k, dfault) {
+ if (this._typeOf(k) === "number") {
+ return k;
+ } else if (
+ this._typeOf(k) === "string" &&
+ k in item &&
+ this._typeOf(item[k]) === "number"
+ ) {
+ return item[k];
+ }
+ return dfault;
+ }
+
+ /**
+ * Simply appends all the strings from a set of fields together. If a field
+ * is a list, then the cells of the list are appended.
+ */
+ _assembleText(item, fields) {
+ let textArr = [];
+ for (let field of fields) {
+ if (field in item) {
+ let type = this._typeOf(item[field]);
+ if (type === "string") {
+ textArr.push(item[field]);
+ } else if (type === "array") {
+ for (let ele of item[field]) {
+ textArr.push(String(ele));
+ }
+ } else {
+ textArr.push(String(item[field]));
+ }
+ }
+ }
+ return textArr.join(" ");
+ }
+
+ /**
+ * Runs the naive bayes text taggers over a set of text fields. Stores the
+ * results in new fields:
+ * nb_tags: a map of text strings to probabilities
+ * nb_tokens: the tokenized text that was tagged
+ *
+ * Config:
+ * fields: an array containing a list of fields to concatenate and tag
+ */
+ naiveBayesTag(item, config) {
+ let text = this._assembleText(item, config.fields);
+ let tokens = this.tokenize(text);
+ let tags = {};
+ let extended_tags = {};
+
+ for (let nbTagger of this.nbTaggers) {
+ let result = nbTagger.tagTokens(tokens);
+ if (result.label !== null && result.confident) {
+ extended_tags[result.label] = result;
+ tags[result.label] = Math.exp(result.logProb);
+ }
+ }
+ item.nb_tags = tags;
+ item.nb_tags_extended = extended_tags;
+ item.nb_tokens = tokens;
+ return item;
+ }
+
+ /**
+ * Selectively runs NMF text taggers depending on which tags were found
+ * by the naive bayes taggers. Writes the results in into new fields:
+ * nmf_tags_parent_weights: map of parent tags to probabilities of those parent tags
+ * nmf_tags: map of strings to maps of strings to probabilities
+ * nmf_tags_parent: map of child tags to parent tags
+ *
+ * Config:
+ * Not configurable
+ */
+ conditionallyNmfTag(item, config) {
+ let nestedNmfTags = {};
+ let parentTags = {};
+ let parentWeights = {};
+
+ if (!("nb_tags" in item) || !("nb_tokens" in item)) {
+ return null;
+ }
+
+ Object.keys(item.nb_tags).forEach(parentTag => {
+ let nmfTagger = this.nmfTaggers[parentTag];
+ if (nmfTagger !== undefined) {
+ nestedNmfTags[parentTag] = {};
+ parentWeights[parentTag] = item.nb_tags[parentTag];
+ let nmfTags = nmfTagger.tagTokens(item.nb_tokens);
+ Object.keys(nmfTags).forEach(nmfTag => {
+ nestedNmfTags[parentTag][nmfTag] = nmfTags[nmfTag];
+ parentTags[nmfTag] = parentTag;
+ });
+ }
+ });
+
+ item.nmf_tags = nestedNmfTags;
+ item.nmf_tags_parent = parentTags;
+ item.nmf_tags_parent_weights = parentWeights;
+
+ return item;
+ }
+
+ /**
+ * Checks a field's value against another value (either from another field
+ * or a constant). If the test passes, then the item is emitted, otherwise
+ * the pipeline is aborted.
+ *
+ * Config:
+ * field Field to read the value to test. Left side of operator.
+ * op one of ==, !=, <, <=, >, >=
+ * rhsValue Constant value to compare against. Right side of operator.
+ * rhsField Field to read value to compare against. Right side of operator.
+ *
+ * NOTE: rhsValue takes precedence over rhsField.
+ */
+ acceptItemByFieldValue(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+ let rhs = null;
+ if ("rhsValue" in config) {
+ rhs = config.rhsValue;
+ } else if ("rhsField" in config && config.rhsField in item) {
+ rhs = item[config.rhsField];
+ }
+ if (rhs === null) {
+ return null;
+ }
+
+ if (
+ // eslint-disable-next-line eqeqeq
+ (config.op === "==" && item[config.field] == rhs) ||
+ // eslint-disable-next-line eqeqeq
+ (config.op === "!=" && item[config.field] != rhs) ||
+ (config.op === "<" && item[config.field] < rhs) ||
+ (config.op === "<=" && item[config.field] <= rhs) ||
+ (config.op === ">" && item[config.field] > rhs) ||
+ (config.op === ">=" && item[config.field] >= rhs)
+ ) {
+ return item;
+ }
+
+ return null;
+ }
+
+ /**
+ * Splits a URL into text-like tokens.
+ *
+ * Config:
+ * field Field containing a URL
+ * dest Field to write the tokens to as an array of strings
+ *
+ * NOTE: Any initial 'www' on the hostname is removed.
+ */
+ tokenizeUrl(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+
+ let url = new URL(item[config.field]);
+ let domain = url.hostname;
+ if (domain.startsWith("www.")) {
+ domain = domain.substring(4);
+ }
+ let toks = this.tokenize(domain);
+ let pathToks = this.tokenize(
+ decodeURIComponent(url.pathname.replace(/\+/g, " "))
+ );
+ for (let tok of pathToks) {
+ toks.push(tok);
+ }
+ for (let pair of url.searchParams.entries()) {
+ let k = this.tokenize(decodeURIComponent(pair[0].replace(/\+/g, " ")));
+ for (let tok of k) {
+ toks.push(tok);
+ }
+ if (pair[1] !== null && pair[1] !== "") {
+ let v = this.tokenize(decodeURIComponent(pair[1].replace(/\+/g, " ")));
+ for (let tok of v) {
+ toks.push(tok);
+ }
+ }
+ }
+ item[config.dest] = toks;
+
+ return item;
+ }
+
+ /**
+ * Gets the hostname (minus any initial "www.") along with the leftmost
+ * directories on the path.
+ *
+ * Config:
+ * field Field containing the URL
+ * dest Field to write the array of strings to
+ * path_length OPTIONAL (DEFAULT: 0) Number of leftmost subdirectories to include
+ */
+ getUrlDomain(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+
+ let url = new URL(item[config.field]);
+ let domain = url.hostname.toLocaleLowerCase();
+ if (domain.startsWith("www.")) {
+ domain = domain.substring(4);
+ }
+ item[config.dest] = domain;
+ let pathLength = 0;
+ if ("path_length" in config) {
+ pathLength = config.path_length;
+ }
+ if (pathLength > 0) {
+ item[config.dest] += url.pathname
+ .toLocaleLowerCase()
+ .split("/")
+ .slice(0, pathLength + 1)
+ .join("/");
+ }
+
+ return item;
+ }
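+
+ // For illustration (not part of the module): with
+ //   item = { url: "https://www.example.com/autos/sports/coupe" }
+ // and config = { field: "url", dest: "url_domain", path_length: 1 },
+ // getUrlDomain sets item.url_domain to "example.com/autos".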
+
+ /**
+ * Splits a field into tokens.
+ * Config:
+ * field Field containing a string to tokenize
+ * dest Field to write the array of strings to
+ */
+ tokenizeField(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+
+ item[config.dest] = this.tokenize(item[config.field]);
+
+ return item;
+ }
+
+ /**
+ * Deep copy from one field to another.
+ * Config:
+ * src Field to read from
+ * dest Field to write to
+ */
+ copyValue(item, config) {
+ if (!(config.src in item)) {
+ return null;
+ }
+
+ item[config.dest] = JSON.parse(JSON.stringify(item[config.src]));
+
+ return item;
+ }
+
+ /**
+ * Converts a field containing a map of strings to a map of strings
+ * to numbers, to a map of strings to numbers containing at most k elements.
+ * This operation is performed by first promoting all the subkeys up one
+ * level, and then taking the top (or bottom) k values.
+ *
+ * Config:
+ * field Points to a map of strings to a map of strings to numbers
+ * k Maximum number of items to keep
+ * descending OPTIONAL (DEFAULT: True) Sorts score in descending order
+ * (i.e. keeps maximum)
+ */
+ keepTopK(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+ let k = this._lookupScalar(item, config.k, 1048576);
+ let descending = !("descending" in config) || config.descending !== false;
+
+ // we can't sort by the values in the map, so we have to convert this
+ // to an array, and then sort.
+ let sortable = [];
+ Object.keys(item[config.field]).forEach(outerKey => {
+ let innerType = this._typeOf(item[config.field][outerKey]);
+ if (innerType === "map") {
+ Object.keys(item[config.field][outerKey]).forEach(innerKey => {
+ sortable.push({
+ key: innerKey,
+ value: item[config.field][outerKey][innerKey],
+ });
+ });
+ } else {
+ sortable.push({ key: outerKey, value: item[config.field][outerKey] });
+ }
+ });
+
+ sortable.sort((a, b) => {
+ if (descending) {
+ return b.value - a.value;
+ }
+ return a.value - b.value;
+ });
+
+ // now take the top k
+ let newMap = {};
+ let i = 0;
+ for (let pair of sortable) {
+ if (i >= k) {
+ break;
+ }
+ newMap[pair.key] = pair.value;
+ i++;
+ }
+ item[config.field] = newMap;
+
+ return item;
+ }
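+
+ // For illustration (not part of the module): with
+ //   item = { tags: { autos: { sports: 0.9, sedans: 0.4 }, food: { baking: 0.7 } } }
+ // and config = { field: "tags", k: 2 },
+ // keepTopK flattens the subkeys and keeps the two largest values:
+ //   item.tags === { sports: 0.9, baking: 0.7 }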
+
+ /**
+ * Scalar multiplies a vector by some constant
+ *
+ * Config:
+ * field Points to:
+ * a map of strings to numbers
+ * an array of numbers
+ * a number
+ * k Either a number, or a string. If it's a number, then this
+ * is the scalar value to multiply by. If it's a string,
+ * the value in the pointed-to field is used.
+ * dfault OPTIONAL (DEFAULT: 0) If k is a string and no numeric
+ * value is found, then use this value.
+ */
+ scalarMultiply(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+ let k = this._lookupScalar(item, config.k, config.dfault);
+
+ let fieldType = this._typeOf(item[config.field]);
+ if (fieldType === "number") {
+ item[config.field] *= k;
+ } else if (fieldType === "array") {
+ for (let i = 0; i < item[config.field].length; i++) {
+ item[config.field][i] *= k;
+ }
+ } else if (fieldType === "map") {
+ Object.keys(item[config.field]).forEach(key => {
+ item[config.field][key] *= k;
+ });
+ } else {
+ return null;
+ }
+
+ return item;
+ }
+
+ /**
+ * Elementwise multiplies either two maps or two arrays together, storing
+ * the result in left. If left and right are not of the same type, this
+ * results in an error.
+ *
+ * Maps are a special case. For maps, the left must be a nested map such as:
+ * { k1: { k11: 1, k12: 2}, k2: { k21: 3, k22: 4 } } and right needs to be a
+ * simple map such as: { k1: 5, k2: 6 }. The operation is then to multiply
+ * every value of every subkey of left by the value of the right key whose
+ * parent key matches. Using the previous examples, the result would be:
+ * { k1: { k11: 5, k12: 10 }, k2: { k21: 18, k22: 24 } } .
+ *
+ * Config:
+ * left
+ * right
+ */
+ elementwiseMultiply(item, config) {
+ if (!(config.left in item) || !(config.right in item)) {
+ return null;
+ }
+ let leftType = this._typeOf(item[config.left]);
+ if (leftType !== this._typeOf(item[config.right])) {
+ return null;
+ }
+ if (leftType === "array") {
+ if (item[config.left].length !== item[config.right].length) {
+ return null;
+ }
+ for (let i = 0; i < item[config.left].length; i++) {
+ item[config.left][i] *= item[config.right][i];
+ }
+ } else if (leftType === "map") {
+ Object.keys(item[config.left]).forEach(outerKey => {
+ let r = 0.0;
+ if (outerKey in item[config.right]) {
+ r = item[config.right][outerKey];
+ }
+ Object.keys(item[config.left][outerKey]).forEach(innerKey => {
+ item[config.left][outerKey][innerKey] *= r;
+ });
+ });
+ } else if (leftType === "number") {
+ item[config.left] *= item[config.right];
+ } else {
+ return null;
+ }
+
+ return item;
+ }
+
+ /**
+ * Vector multiplies (i.e. dot products) two vectors and stores the result in
+ * a third field. Both vectors must either be maps, or arrays of numbers with
+ * the same length.
+ *
+ * Config:
+ * left A field pointing to either a map of strings to numbers,
+ * or an array of numbers
+ * right A field pointing to either a map of strings to numbers,
+ * or an array of numbers
+ * dest The field to store the dot product.
+ */
+ vectorMultiply(item, config) {
+ if (!(config.left in item) || !(config.right in item)) {
+ return null;
+ }
+
+ let leftType = this._typeOf(item[config.left]);
+ if (leftType !== this._typeOf(item[config.right])) {
+ return null;
+ }
+
+ let destVal = 0.0;
+ if (leftType === "array") {
+ if (item[config.left].length !== item[config.right].length) {
+ return null;
+ }
+ for (let i = 0; i < item[config.left].length; i++) {
+ destVal += item[config.left][i] * item[config.right][i];
+ }
+ } else if (leftType === "map") {
+ Object.keys(item[config.left]).forEach(key => {
+ if (key in item[config.right]) {
+ destVal += item[config.left][key] * item[config.right][key];
+ }
+ });
+ } else {
+ return null;
+ }
+
+ item[config.dest] = destVal;
+ return item;
+ }
+
+ /**
+ * Adds a constant value to all elements in the field. Mathematically,
+ * this is the same as taking a 1-vector, scalar multiplying it by k,
+ * and then vector adding it to a field.
+ *
+ * Config:
+ * field A field pointing to either a map of strings to numbers,
+ * or an array of numbers
+ * k Either a number, or a string. If it's a number, then this
+ * is the scalar value to add. If it's a string,
+ * the value in the pointed-to field is used.
+ * dfault OPTIONAL (DEFAULT: 0) If k is a string and no numeric
+ * value is found, then use this value.
+ */
+ scalarAdd(item, config) {
+ let k = this._lookupScalar(item, config.k, config.dfault);
+ if (!(config.field in item)) {
+ return null;
+ }
+
+ let fieldType = this._typeOf(item[config.field]);
+ if (fieldType === "array") {
+ for (let i = 0; i < item[config.field].length; i++) {
+ item[config.field][i] += k;
+ }
+ } else if (fieldType === "map") {
+ Object.keys(item[config.field]).forEach(key => {
+ item[config.field][key] += k;
+ });
+ } else if (fieldType === "number") {
+ item[config.field] += k;
+ } else {
+ return null;
+ }
+
+ return item;
+ }
+
+ /**
+ * Adds two vectors together and stores the result in left.
+ *
+ * Config:
+ * left A field pointing to either a map of strings to numbers,
+ * or an array of numbers
+ * right A field pointing to either a map of strings to numbers,
+ * or an array of numbers
+ */
+ vectorAdd(item, config) {
+ if (!(config.left in item)) {
+ return this.copyValue(item, { src: config.right, dest: config.left });
+ }
+ if (!(config.right in item)) {
+ return null;
+ }
+
+ let leftType = this._typeOf(item[config.left]);
+ if (leftType !== this._typeOf(item[config.right])) {
+ return null;
+ }
+ if (leftType === "array") {
+ if (item[config.left].length !== item[config.right].length) {
+ return null;
+ }
+ for (let i = 0; i < item[config.left].length; i++) {
+ item[config.left][i] += item[config.right][i];
+ }
+ return item;
+ } else if (leftType === "map") {
+ Object.keys(item[config.right]).forEach(key => {
+ let v = 0;
+ if (key in item[config.left]) {
+ v = item[config.left][key];
+ }
+ item[config.left][key] = v + item[config.right][key];
+ });
+ return item;
+ }
+
+ return null;
+ }
+
+ /**
+ * Converts a vector from real values to boolean integers. (i.e. either 1/0
+ * or 1/-1).
+ *
+ * Config:
+ * field Field containing either a map of strings to numbers or
+ * an array of numbers to convert.
+ * threshold OPTIONAL (DEFAULT: 0) Values above this will be replaced
+ * with 1.0. Those below will be converted to 0.
+ * keep_negative OPTIONAL (DEFAULT: False) If true, values below the
+ * threshold will be converted to -1 instead of 0.
+ */
+ makeBoolean(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+ let threshold = this._lookupScalar(item, config.threshold, 0.0);
+ let type = this._typeOf(item[config.field]);
+ if (type === "array") {
+ for (let i = 0; i < item[config.field].length; i++) {
+ if (item[config.field][i] > threshold) {
+ item[config.field][i] = 1.0;
+ } else if (config.keep_negative) {
+ item[config.field][i] = -1.0;
+ } else {
+ item[config.field][i] = 0.0;
+ }
+ }
+ } else if (type === "map") {
+ Object.keys(item[config.field]).forEach(key => {
+ let value = item[config.field][key];
+ if (value > threshold) {
+ item[config.field][key] = 1.0;
+ } else if (config.keep_negative) {
+ item[config.field][key] = -1.0;
+ } else {
+ item[config.field][key] = 0.0;
+ }
+ });
+ } else if (type === "number") {
+ let value = item[config.field];
+ if (value > threshold) {
+ item[config.field] = 1.0;
+ } else if (config.keep_negative) {
+ item[config.field] = -1.0;
+ } else {
+ item[config.field] = 0.0;
+ }
+ } else {
+ return null;
+ }
+
+ return item;
+ }
+
+ /**
+ * Removes all keys from the item except for the ones specified.
+ *
+ * Config:
+ * fields An array of strings indicating the fields to keep
+ */
+ allowFields(item, config) {
+ let newItem = {};
+ for (let ele of config.fields) {
+ if (ele in item) {
+ newItem[ele] = item[ele];
+ }
+ }
+ return newItem;
+ }
+
+ /**
+ * Removes all keys whose value does not exceed some threshold.
+ *
+ * Config:
+ * field Points to a map of strings to numbers
+ * threshold Values must exceed this value, otherwise they are removed.
+ */
+ filterByValue(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+ let threshold = this._lookupScalar(item, config.threshold, 0.0);
+ let filtered = {};
+ Object.keys(item[config.field]).forEach(key => {
+ let value = item[config.field][key];
+ if (value > threshold) {
+ filtered[key] = value;
+ }
+ });
+ item[config.field] = filtered;
+
+ return item;
+ }
+
+ /**
+ * Rewrites a field so that its values are now L2 normed.
+ *
+ * Config:
+ * field Points to a map of strings to numbers, or an array of numbers
+ */
+ l2Normalize(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+ let data = item[config.field];
+ let type = this._typeOf(data);
+ if (type === "array") {
+ let norm = 0.0;
+ for (let datum of data) {
+ norm += datum * datum;
+ }
+ norm = Math.sqrt(norm);
+ if (norm !== 0) {
+ for (let i = 0; i < data.length; i++) {
+ data[i] /= norm;
+ }
+ }
+ } else if (type === "map") {
+ let norm = 0.0;
+ Object.keys(data).forEach(key => {
+ norm += data[key] * data[key];
+ });
+ norm = Math.sqrt(norm);
+ if (norm !== 0) {
+ Object.keys(data).forEach(key => {
+ data[key] /= norm;
+ });
+ }
+ } else {
+ return null;
+ }
+
+ item[config.field] = data;
+
+ return item;
+ }
+
+ /**
+ * Rewrites a field so that all of its values sum to 1.0
+ *
+ * Config:
+ * field Points to a map of strings to numbers, or an array of numbers
+ */
+ probNormalize(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+ let data = item[config.field];
+ let type = this._typeOf(data);
+ if (type === "array") {
+ let norm = 0.0;
+ for (let datum of data) {
+ norm += datum;
+ }
+ if (norm !== 0) {
+ for (let i = 0; i < data.length; i++) {
+ data[i] /= norm;
+ }
+ }
+ } else if (type === "map") {
+ let norm = 0.0;
+ Object.keys(item[config.field]).forEach(key => {
+ norm += item[config.field][key];
+ });
+ if (norm !== 0) {
+ Object.keys(item[config.field]).forEach(key => {
+ item[config.field][key] /= norm;
+ });
+ }
+ } else {
+ return null;
+ }
+
+ return item;
+ }
+
+ /**
+ * Stores a value, if it is not already present
+ *
+ * Config:
+ * field field to write to if it is missing
+ * value value to store in that field
+ */
+ setDefault(item, config) {
+ let val = this._lookupScalar(item, config.value, config.value);
+ if (!(config.field in item)) {
+ item[config.field] = val;
+ }
+
+ return item;
+ }
+
+ /**
+ * Selectively promotes a value from an inner map up to the outer map
+ *
+ * Config:
+ * haystack Points to a map of strings to values
+ * needle Key inside the map we should promote up
+ * dest Where we should write the value of haystack[needle]
+ */
+ lookupValue(item, config) {
+ if (config.haystack in item && config.needle in item[config.haystack]) {
+ item[config.dest] = item[config.haystack][config.needle];
+ }
+
+ return item;
+ }
+
+ /**
+ * Demotes a field into a map
+ *
+ * Config:
+ * src Field to copy
+ * dest_map Points to a map
+ * dest_key Key inside dest_map to copy src to
+ */
+ copyToMap(item, config) {
+ if (config.src in item) {
+ if (!(config.dest_map in item)) {
+ item[config.dest_map] = {};
+ }
+ item[config.dest_map][config.dest_key] = item[config.src];
+ }
+
+ return item;
+ }
+
+ /**
+ * Scalar multiplies the values of a nested tag map, optionally log scaling them first.
+ *
+ * Config:
+ * field Points to a string to number map
+ * k Scalar to multiply the values by
+ * log_scale Boolean, if true, then the values will be transformed
+ * by a logarithm prior to multiplication
+ */
+ scalarMultiplyTag(item, config) {
+ let EPSILON = 0.000001;
+ if (!(config.field in item)) {
+ return null;
+ }
+ let k = this._lookupScalar(item, config.k, 1);
+ let type = this._typeOf(item[config.field]);
+ if (type === "map") {
+ Object.keys(item[config.field]).forEach(parentKey => {
+ Object.keys(item[config.field][parentKey]).forEach(key => {
+ let v = item[config.field][parentKey][key];
+ if (config.log_scale) {
+ v = Math.log(v + EPSILON);
+ }
+ item[config.field][parentKey][key] = v * k;
+ });
+ });
+ } else {
+ return null;
+ }
+
+ return item;
+ }
+
+ /**
+ * Independently applies softmax across all subtags.
+ *
+ * Config:
+ * field Points to a map of strings with values being another map of strings
+ */
+ applySoftmaxTags(item, config) {
+ let type = this._typeOf(item[config.field]);
+ if (type !== "map") {
+ return null;
+ }
+
+ let abort = false;
+ let softmaxSum = {};
+ Object.keys(item[config.field]).forEach(tag => {
+ if (this._typeOf(item[config.field][tag]) !== "map") {
+ abort = true;
+ return;
+ }
+ if (abort) {
+ return;
+ }
+ softmaxSum[tag] = 0;
+ Object.keys(item[config.field][tag]).forEach(subtag => {
+ if (this._typeOf(item[config.field][tag][subtag]) !== "number") {
+ abort = true;
+ return;
+ }
+ let score = item[config.field][tag][subtag];
+ softmaxSum[tag] += Math.exp(score);
+ });
+ });
+ if (abort) {
+ return null;
+ }
+
+ Object.keys(item[config.field]).forEach(tag => {
+ Object.keys(item[config.field][tag]).forEach(subtag => {
+ item[config.field][tag][subtag] =
+ Math.exp(item[config.field][tag][subtag]) / softmaxSum[tag];
+ });
+ });
+
+ return item;
+ }
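+
+ // For illustration (not part of the module): with
+ //   item = { tags: { autos: { sports: 1, sedans: 0 } } }
+ // and config = { field: "tags" },
+ // each subtag becomes exp(score) / sum(exp(scores)) within its parent tag:
+ //   item.tags.autos.sports ≈ 0.73 and item.tags.autos.sedans ≈ 0.27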
+
+ /**
+ * Vector adds a field and stores the result in left.
+ *
+ * Config:
+ * field The field to vector add
+ */
+ combinerAdd(left, right, config) {
+ if (!(config.field in right)) {
+ return left;
+ }
+ let type = this._typeOf(right[config.field]);
+ if (!(config.field in left)) {
+ if (type === "map") {
+ left[config.field] = {};
+ } else if (type === "array") {
+ left[config.field] = [];
+ } else if (type === "number") {
+ left[config.field] = 0;
+ } else {
+ return null;
+ }
+ }
+ if (type !== this._typeOf(left[config.field])) {
+ return null;
+ }
+ if (type === "map") {
+ Object.keys(right[config.field]).forEach(key => {
+ if (!(key in left[config.field])) {
+ left[config.field][key] = 0;
+ }
+ left[config.field][key] += right[config.field][key];
+ });
+ } else if (type === "array") {
+ for (let i = 0; i < right[config.field].length; i++) {
+ if (i < left[config.field].length) {
+ left[config.field][i] += right[config.field][i];
+ } else {
+ left[config.field].push(right[config.field][i]);
+ }
+ }
+ } else if (type === "number") {
+ left[config.field] += right[config.field];
+ } else {
+ return null;
+ }
+
+ return left;
+ }
+
+ /**
+ * Stores the maximum value of the field in left.
+ *
+ * Config:
+ * field The field to take the elementwise maximum of
+ */
+ combinerMax(left, right, config) {
+ if (!(config.field in right)) {
+ return left;
+ }
+ let type = this._typeOf(right[config.field]);
+ if (!(config.field in left)) {
+ if (type === "map") {
+ left[config.field] = {};
+ } else if (type === "array") {
+ left[config.field] = [];
+ } else if (type === "number") {
+ left[config.field] = 0;
+ } else {
+ return null;
+ }
+ }
+ if (type !== this._typeOf(left[config.field])) {
+ return null;
+ }
+ if (type === "map") {
+ Object.keys(right[config.field]).forEach(key => {
+ if (
+ !(key in left[config.field]) ||
+ right[config.field][key] > left[config.field][key]
+ ) {
+ left[config.field][key] = right[config.field][key];
+ }
+ });
+ } else if (type === "array") {
+ for (let i = 0; i < right[config.field].length; i++) {
+ if (i < left[config.field].length) {
+ if (left[config.field][i] < right[config.field][i]) {
+ left[config.field][i] = right[config.field][i];
+ }
+ } else {
+ left[config.field].push(right[config.field][i]);
+ }
+ }
+ } else if (type === "number") {
+ if (left[config.field] < right[config.field]) {
+ left[config.field] = right[config.field];
+ }
+ } else {
+ return null;
+ }
+
+ return left;
+ }
+
+ /**
+ * Associates a value in right with another value in right. This association
+ * is then stored in a map in left.
+ *
+ * For example: If a sequence of rights is:
+ * { 'tags': {}, 'url_domain': 'maseratiusa.com/maserati', 'time': 41 }
+ * { 'tags': {}, 'url_domain': 'mbusa.com/mercedes', 'time': 21 }
+ * { 'tags': {}, 'url_domain': 'maseratiusa.com/maserati', 'time': 34 }
+ *
+ * Then assuming a 'sum' operation, left can build a map that would look like:
+ * {
+ * 'maseratiusa.com/maserati': 75,
+ * 'mbusa.com/mercedes': 21,
+ * }
+ *
+ * Fields:
+ * left_field field in the left to store / update the map
+ * right_key_field Field in the right to use as a key
+ * right_value_field Field in the right to use as a value
+ * operation One of "sum", "max", "overwrite", "count"
+ */
+ combinerCollectValues(left, right, config) {
+ let op;
+ if (config.operation === "sum") {
+ op = (a, b) => a + b;
+ } else if (config.operation === "max") {
+ op = (a, b) => (a > b ? a : b);
+ } else if (config.operation === "overwrite") {
+ op = (a, b) => b;
+ } else if (config.operation === "count") {
+ op = (a, b) => a + 1;
+ } else {
+ return null;
+ }
+ if (!(config.left_field in left)) {
+ left[config.left_field] = {};
+ }
+ if (
+ !(config.right_key_field in right) ||
+ !(config.right_value_field in right)
+ ) {
+ return left;
+ }
+
+ let key = right[config.right_key_field];
+ let rightValue = right[config.right_value_field];
+ let leftValue = 0.0;
+ if (key in left[config.left_field]) {
+ leftValue = left[config.left_field][key];
+ }
+
+ left[config.left_field][key] = op(leftValue, rightValue);
+
+ return left;
+ }
+
+ /**
+ * Executes a recipe. Returns an object on success, or null on failure.
+ */
+ executeRecipe(item, recipe) {
+ let newItem = item;
+ if (recipe) {
+ for (let step of recipe) {
+ let op = this.ITEM_BUILDER_REGISTRY[step.function];
+ if (op === undefined) {
+ return null;
+ }
+ newItem = op.call(this, newItem, step);
+ if (newItem === null) {
+ break;
+ }
+ }
+ }
+ return newItem;
+ }
+
+ /**
+ * Executes a combiner recipe. Returns an object on success, or null on failure.
+ */
+ executeCombinerRecipe(item1, item2, recipe) {
+ let newItem1 = item1;
+ for (let step of recipe) {
+ let op = this.ITEM_COMBINER_REGISTRY[step.function];
+ if (op === undefined) {
+ return null;
+ }
+ newItem1 = op.call(this, newItem1, item2, step);
+ if (newItem1 === null) {
+ break;
+ }
+ }
+
+ return newItem1;
+ }
+};
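+
+// Illustrative usage, not part of the module: a sketch of running a builder recipe,
+// assuming an executor built with Naive Bayes / NMF taggers and the tokenize helper
+// from Tokenize.jsm. The step functions and config keys are those registered in
+// ITEM_BUILDER_REGISTRY above; the URL is a placeholder.
+//
+//   const executor = new RecipeExecutor(nbTaggers, nmfTaggers, tokenize);
+//   const item = executor.executeRecipe(
+//     { url: "https://www.example.com/autos/sports" },
+//     [
+//       { function: "get_url_domain", path_length: 1, field: "url", dest: "url_domain" },
+//       { function: "tokenize_url", field: "url", dest: "url_toks" },
+//     ]
+//   );
+//   // item.url_domain === "example.com/autos"; item.url_toks is an array of string tokens.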
diff --git a/browser/components/newtab/lib/PersonalityProvider/Tokenize.jsm b/browser/components/newtab/lib/PersonalityProvider/Tokenize.jsm
new file mode 100644
index 0000000000..94835557a6
--- /dev/null
+++ b/browser/components/newtab/lib/PersonalityProvider/Tokenize.jsm
@@ -0,0 +1,89 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+"use strict";
+
+// We load this into a worker using importScripts, and in tests using import.
+// We use var to avoid name collision errors.
+// eslint-disable-next-line no-var
+var EXPORTED_SYMBOLS = ["tokenize", "toksToTfIdfVector"];
+
+// Unicode specifies certain mnemonics for code pages and character classes.
+// They call them "character properties" https://en.wikipedia.org/wiki/Unicode_character_property .
+// These mnemonics have been adopted by many regular expression libraries;
+// however, the standard JavaScript regexp system doesn't support Unicode
+// character properties, so we have to define these ourselves.
+//
+// Each of these sections contains the character values / ranges for a specific
+// character property: Whitespace, Symbol (S), Punctuation (P), Number (N),
+// Mark (M), and Letter (L).
+const UNICODE_SPACE =
+ "\x20\xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000";
+const UNICODE_SYMBOL =
+ "\\x24\\x2B\x3C-\x3E\\x5E\x60\\x7C\x7E\xA2-\xA6\xA8\xA9\xAC\xAE-\xB1\xB4\xB8\xD7\xF7\u02C2-\u02C5\u02D2-\u02DF\u02E5-\u02EB\u02ED\u02EF-\u02FF\u0375\u0384\u0385\u03F6\u0482\u058D-\u058F\u0606-\u0608\u060B\u060E\u060F\u06DE\u06E9\u06FD\u06FE\u07F6\u09F2\u09F3\u09FA\u09FB\u0AF1\u0B70\u0BF3-\u0BFA\u0C7F\u0D4F\u0D79\u0E3F\u0F01-\u0F03\u0F13\u0F15-\u0F17\u0F1A-\u0F1F\u0F34\u0F36\u0F38\u0FBE-\u0FC5\u0FC7-\u0FCC\u0FCE\u0FCF\u0FD5-\u0FD8\u109E\u109F\u1390-\u1399\u17DB\u1940\u19DE-\u19FF\u1B61-\u1B6A\u1B74-\u1B7C\u1FBD\u1FBF-\u1FC1\u1FCD-\u1FCF\u1FDD-\u1FDF\u1FED-\u1FEF\u1FFD\u1FFE\u2044\u2052\u207A-\u207C\u208A-\u208C\u20A0-\u20BE\u2100\u2101\u2103-\u2106\u2108\u2109\u2114\u2116-\u2118\u211E-\u2123\u2125\u2127\u2129\u212E\u213A\u213B\u2140-\u2144\u214A-\u214D\u214F\u218A\u218B\u2190-\u2307\u230C-\u2328\u232B-\u23FE\u2400-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u2767\u2794-\u27C4\u27C7-\u27E5\u27F0-\u2982\u2999-\u29D7\u29DC-\u29FB\u29FE-\u2B73\u2B76-\u2B95\u2B98-\u2BB9\u2BBD-\u2BC8\u2BCA-\u2BD1\u2BEC-\u2BEF\u2CE5-\u2CEA\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB\u3004\u3012\u3013\u3020\u3036\u3037\u303E\u303F\u309B\u309C\u3190\u3191\u3196-\u319F\u31C0-\u31E3\u3200-\u321E\u322A-\u3247\u3250\u3260-\u327F\u328A-\u32B0\u32C0-\u32FE\u3300-\u33FF\u4DC0-\u4DFF\uA490-\uA4C6\uA700-\uA716\uA720\uA721\uA789\uA78A\uA828-\uA82B\uA836-\uA839\uAA77-\uAA79\uAB5B\uFB29\uFBB2-\uFBC1\uFDFC\uFDFD\uFE62\uFE64-\uFE66\uFE69\uFF04\uFF0B\uFF1C-\uFF1E\uFF3E\uFF40\uFF5C\uFF5E\uFFE0-\uFFE6\uFFE8-\uFFEE\uFFFC\uFFFD";
+const UNICODE_PUNCT =
+ "\x21-\x23\x25-\\x2A\x2C-\x2F\x3A\x3B\\x3F\x40\\x5B-\\x5D\x5F\\x7B\x7D\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u0AF0\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E44\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65";
+
+const UNICODE_NUMBER =
+ "0-9\xB2\xB3\xB9\xBC-\xBE\u0660-\u0669\u06F0-\u06F9\u07C0-\u07C9\u0966-\u096F\u09E6-\u09EF\u09F4-\u09F9\u0A66-\u0A6F\u0AE6-\u0AEF\u0B66-\u0B6F\u0B72-\u0B77\u0BE6-\u0BF2\u0C66-\u0C6F\u0C78-\u0C7E\u0CE6-\u0CEF\u0D58-\u0D5E\u0D66-\u0D78\u0DE6-\u0DEF\u0E50-\u0E59\u0ED0-\u0ED9\u0F20-\u0F33\u1040-\u1049\u1090-\u1099\u1369-\u137C\u16EE-\u16F0\u17E0-\u17E9\u17F0-\u17F9\u1810-\u1819\u1946-\u194F\u19D0-\u19DA\u1A80-\u1A89\u1A90-\u1A99\u1B50-\u1B59\u1BB0-\u1BB9\u1C40-\u1C49\u1C50-\u1C59\u2070\u2074-\u2079\u2080-\u2089\u2150-\u2182\u2185-\u2189\u2460-\u249B\u24EA-\u24FF\u2776-\u2793\u2CFD\u3007\u3021-\u3029\u3038-\u303A\u3192-\u3195\u3220-\u3229\u3248-\u324F\u3251-\u325F\u3280-\u3289\u32B1-\u32BF\uA620-\uA629\uA6E6-\uA6EF\uA830-\uA835\uA8D0-\uA8D9\uA900-\uA909\uA9D0-\uA9D9\uA9F0-\uA9F9\uAA50-\uAA59\uABF0-\uABF9\uFF10-\uFF19";
+const UNICODE_MARK =
+ "\u0300-\u036F\u0483-\u0489\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED\u0711\u0730-\u074A\u07A6-\u07B0\u07EB-\u07F3\u0816-\u0819\u081B-\u0823\u0825-\u0827\u0829-\u082D\u0859-\u085B\u08D4-\u08E1\u08E3-\u0903\u093A-\u093C\u093E-\u094F\u0951-\u0957\u0962\u0963\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3\u0A01-\u0A03\u0A3C\u0A3E-\u0A42\u0A47\u0A48\u0A4B-\u0A4D\u0A51\u0A70\u0A71\u0A75\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD\u0AE2\u0AE3\u0B01-\u0B03\u0B3C\u0B3E-\u0B44\u0B47\u0B48\u0B4B-\u0B4D\u0B56\u0B57\u0B62\u0B63\u0B82\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7\u0C00-\u0C03\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55\u0C56\u0C62\u0C63\u0C81-\u0C83\u0CBC\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5\u0CD6\u0CE2\u0CE3\u0D01-\u0D03\u0D3E-\u0D44\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0D62\u0D63\u0D82\u0D83\u0DCA\u0DCF-\u0DD4\u0DD6\u0DD8-\u0DDF\u0DF2\u0DF3\u0E31\u0E34-\u0E3A\u0E47-\u0E4E\u0EB1\u0EB4-\u0EB9\u0EBB\u0EBC\u0EC8-\u0ECD\u0F18\u0F19\u0F35\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86\u0F87\u0F8D-\u0F97\u0F99-\u0FBC\u0FC6\u102B-\u103E\u1056-\u1059\u105E-\u1060\u1062-\u1064\u1067-\u106D\u1071-\u1074\u1082-\u108D\u108F\u109A-\u109D\u135D-\u135F\u1712-\u1714\u1732-\u1734\u1752\u1753\u1772\u1773\u17B4-\u17D3\u17DD\u180B-\u180D\u1885\u1886\u18A9\u1920-\u192B\u1930-\u193B\u1A17-\u1A1B\u1A55-\u1A5E\u1A60-\u1A7C\u1A7F\u1AB0-\u1ABE\u1B00-\u1B04\u1B34-\u1B44\u1B6B-\u1B73\u1B80-\u1B82\u1BA1-\u1BAD\u1BE6-\u1BF3\u1C24-\u1C37\u1CD0-\u1CD2\u1CD4-\u1CE8\u1CED\u1CF2-\u1CF4\u1CF8\u1CF9\u1DC0-\u1DF5\u1DFB-\u1DFF\u20D0-\u20F0\u2CEF-\u2CF1\u2D7F\u2DE0-\u2DFF\u302A-\u302F\u3099\u309A\uA66F-\uA672\uA674-\uA67D\uA69E\uA69F\uA6F0\uA6F1\uA802\uA806\uA80B\uA823-\uA827\uA880\uA881\uA8B4-\uA8C5\uA8E0-\uA8F1\uA926-\uA92D\uA947-\uA953\uA980-\uA983\uA9B3-\uA9C0\uA9E5\uAA29-\uAA36\uAA43\uAA4C\uAA4D\uAA7B-\uAA7D\uAAB0\uAAB2-\uAAB4\uAAB7\uAAB8\uAABE\uAABF\uAAC1\uAAEB-\uAAEF\uAAF5\uAAF6\uABE3-\uABEA\uABEC\uABED\uFB1E\uFE00-\uFE0F\uFE20-\uFE2F";
+const UNICODE_LETTER =
+ "A-Za-z\xAA\xB5\xBA\xC0-\xD6\xD8-\xF6\xF8-\u02C1\u02C6-\u02D1\u02E0-\u02E4\u02EC\u02EE\u0370-\u0374\u0376\u0377\u037A-\u037D\u037F\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03F5\u03F7-\u0481\u048A-\u052F\u0531-\u0556\u0559\u0561-\u0587\u05D0-\u05EA\u05F0-\u05F2\u0620-\u064A\u066E\u066F\u0671-\u06D3\u06D5\u06E5\u06E6\u06EE\u06EF\u06FA-\u06FC\u06FF\u0710\u0712-\u072F\u074D-\u07A5\u07B1\u07CA-\u07EA\u07F4\u07F5\u07FA\u0800-\u0815\u081A\u0824\u0828\u0840-\u0858\u08A0-\u08B4\u08B6-\u08BD\u0904-\u0939\u093D\u0950\u0958-\u0961\u0971-\u0980\u0985-\u098C\u098F\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09BD\u09CE\u09DC\u09DD\u09DF-\u09E1\u09F0\u09F1\u0A05-\u0A0A\u0A0F\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32\u0A33\u0A35\u0A36\u0A38\u0A39\u0A59-\u0A5C\u0A5E\u0A72-\u0A74\u0A85-\u0A8D\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2\u0AB3\u0AB5-\u0AB9\u0ABD\u0AD0\u0AE0\u0AE1\u0AF9\u0B05-\u0B0C\u0B0F\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32\u0B33\u0B35-\u0B39\u0B3D\u0B5C\u0B5D\u0B5F-\u0B61\u0B71\u0B83\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99\u0B9A\u0B9C\u0B9E\u0B9F\u0BA3\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB9\u0BD0\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C39\u0C3D\u0C58-\u0C5A\u0C60\u0C61\u0C80\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBD\u0CDE\u0CE0\u0CE1\u0CF1\u0CF2\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D3A\u0D3D\u0D4E\u0D54-\u0D56\u0D5F-\u0D61\u0D7A-\u0D7F\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6\u0E01-\u0E30\u0E32\u0E33\u0E40-\u0E46\u0E81\u0E82\u0E84\u0E87\u0E88\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F\u0EA1-\u0EA3\u0EA5\u0EA7\u0EAA\u0EAB\u0EAD-\u0EB0\u0EB2\u0EB3\u0EBD\u0EC0-\u0EC4\u0EC6\u0EDC-\u0EDF\u0F00\u0F40-\u0F47\u0F49-\u0F6C\u0F88-\u0F8C\u1000-\u102A\u103F\u1050-\u1055\u105A-\u105D\u1061\u1065\u1066\u106E-\u1070\u1075-\u1081\u108E\u10A0-\u10C5\u10C7\u10CD\u10D0-\u10FA\u10FC-\u1248\u124A-\u124D\u1250-\u1256\u1258\u125A-\u125D\u1260-\u1288\u128A-\u128D\u1290-\u12B0\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2-\u12C5\u12C8-\u12D6\u12D8-\u1310\u1312-\u1315\u1318-\u135A\u1380-\u138F\u13A0-\u13F5\u13F8-\u13FD\u1401-\u166C\u166F-\u167F\u1681-\u169A\u16A0-\u16EA\u16F1-\u16F8\u1700-\u170C\u170E-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176C\u176E-\u1770\u1780-\u17B3\u17D7\u17DC\u1820-\u1877\u1880-\u1884\u1887-\u18A8\u18AA\u18B0-\u18F5\u1900-\u191E\u1950-\u196D\u1970-\u1974\u1980-\u19AB\u19B0-\u19C9\u1A00-\u1A16\u1A20-\u1A54\u1AA7\u1B05-\u1B33\u1B45-\u1B4B\u1B83-\u1BA0\u1BAE\u1BAF\u1BBA-\u1BE5\u1C00-\u1C23\u1C4D-\u1C4F\u1C5A-\u1C7D\u1C80-\u1C88\u1CE9-\u1CEC\u1CEE-\u1CF1\u1CF5\u1CF6\u1D00-\u1DBF\u1E00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2071\u207F\u2090-\u209C\u2102\u2107\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u212F-\u2139\u213C-\u213F\u2145-\u2149\u214E\u2183\u2184\u2C00-\u2C2E\u2C30-\u2C5E\u2C60-\u2CE4\u2CEB-\u2CEE\u2CF2\u2CF3\u2D00-\u2D25\u2D27\u2D2D\u2D30-\u2D67\u2D6F\u2D80-\u2D96\u2DA0-\u2DA6\u2DA8-\u2DAE\u2DB0-\u2DB6\u2DB8-\u2DBE\u2DC0-\u2DC6\u2DC8-\u2DCE\u2DD0-\u2DD6\u2DD8-\u2DDE\u2E2F\u3005\u3006\u3031-\u3035\u303B\u303C\u3041-\u3096\u309D-\u309F\u30A1-\u30FA\u30FC-\u30FF\u3105-\u312D\u3131-\u318E\u31A0-\u31BA\u31F0-\u31FF\u3400-\u4DB5\u4E00-\u9FD5\uA000-\uA48C\uA4D0-\uA4FD\uA500-\uA60C\uA610-\uA61F\uA62A\uA62B\uA640-\uA66E\uA67F-\uA69D\uA6A0-\uA6E5\uA717-\uA71F\uA722-\uA788\uA78B-\uA7AE\uA7B0-\uA7B7\uA7F7-\uA801\uA803-\uA805\uA807-\uA80A\uA80C-\uA822\uA840-\
uA873\uA882-\uA8B3\uA8F2-\uA8F7\uA8FB\uA8FD\uA90A-\uA925\uA930-\uA946\uA960-\uA97C\uA984-\uA9B2\uA9CF\uA9E0-\uA9E4\uA9E6-\uA9EF\uA9FA-\uA9FE\uAA00-\uAA28\uAA40-\uAA42\uAA44-\uAA4B\uAA60-\uAA76\uAA7A\uAA7E-\uAAAF\uAAB1\uAAB5\uAAB6\uAAB9-\uAABD\uAAC0\uAAC2\uAADB-\uAADD\uAAE0-\uAAEA\uAAF2-\uAAF4\uAB01-\uAB06\uAB09-\uAB0E\uAB11-\uAB16\uAB20-\uAB26\uAB28-\uAB2E\uAB30-\uAB5A\uAB5C-\uAB65\uAB70-\uABE2\uAC00-\uD7A3\uD7B0-\uD7C6\uD7CB-\uD7FB\uF900-\uFA6D\uFA70-\uFAD9\uFB00-\uFB06\uFB13-\uFB17\uFB1D\uFB1F-\uFB28\uFB2A-\uFB36\uFB38-\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46-\uFBB1\uFBD3-\uFD3D\uFD50-\uFD8F\uFD92-\uFDC7\uFDF0-\uFDFB\uFE70-\uFE74\uFE76-\uFEFC\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC";
+
+const REGEXP_SPLITS = new RegExp(
+ `[${UNICODE_SPACE}${UNICODE_SYMBOL}${UNICODE_PUNCT}]+`
+);
+// Match all token characters, so it is okay for this regex to span multiple
+// code points (e.g. letters followed by combining marks).
+// eslint-disable-next-line no-misleading-character-class
+const REGEXP_ALPHANUMS = new RegExp(
+ `^[${UNICODE_NUMBER}${UNICODE_MARK}${UNICODE_LETTER}]+$`
+);
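For comparison only: the module builds these classes by hand because, as the comment above notes, JavaScript regular expressions historically lacked Unicode property escapes. A rough modern equivalent is sketched below as an assumption about what the manual ranges approximate; it is not what this module ships.

    // Rough equivalents using ES2018+ Unicode property escapes, shown only
    // to illustrate what the hand-built character classes above approximate.
    const SPLITS_ALT = /[\p{Z}\p{S}\p{P}]+/u;
    const ALPHANUMS_ALT = /^[\p{N}\p{M}\p{L}]+$/u;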
+
+/**
+ * Downcases the text, and splits it into runs of consecutive alphanumeric
+ * characters. This is locale aware, and so will not strip accents. This uses
+ * "word breaks", and so is not appropriate for languages without them
+ * (e.g. Chinese).
+ */
+function tokenize(text) {
+ return text
+ .toLocaleLowerCase()
+ .split(REGEXP_SPLITS)
+ .filter(tok => tok.match(REGEXP_ALPHANUMS));
+}
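A quick illustration of what the function above produces, following from the character classes defined earlier (spaces, symbols, and punctuation are split points, and empty fragments are filtered out); the sample string is arbitrary:

    tokenize("Firefox's New Tab: Personalization!");
    // expected: ["firefox", "s", "new", "tab", "personalization"]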
+
+/**
+ * Converts a sequence of tokens into an L2-normed TF-IDF vector. Any terms
+ * that are not preindexed (i.e. do not have a computed inverse document
+ * frequency) will be dropped.
+ */
+function toksToTfIdfVector(tokens, vocab_idfs) {
+ let tfidfs = {};
+
+  // calculate the term frequencies
+ for (let tok of tokens) {
+ if (!(tok in vocab_idfs)) {
+ continue;
+ }
+ if (!(tok in tfidfs)) {
+ tfidfs[tok] = [vocab_idfs[tok][0], 1];
+ } else {
+ tfidfs[tok][1]++;
+ }
+ }
+
+ // now multiply by the log inverse document frequencies, then take
+ // the L2 norm of this.
+ let l2Norm = 0.0;
+ Object.keys(tfidfs).forEach(tok => {
+ tfidfs[tok][1] *= vocab_idfs[tok][1];
+ l2Norm += tfidfs[tok][1] * tfidfs[tok][1];
+ });
+ l2Norm = Math.sqrt(l2Norm);
+ Object.keys(tfidfs).forEach(tok => {
+ tfidfs[tok][1] /= l2Norm;
+ });
+
+ return tfidfs;
+}
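A minimal worked example of the shapes involved, inferred from the code above: vocab_idfs maps each known token to a [termId, idf] pair, and the result maps each kept token to [termId, l2NormalizedTfIdf]. The idf values below are made up for illustration.

    const vocabIdfs = {
      firefox: [0, 1.2], // [termId, idf]; illustrative numbers only
      browser: [1, 0.8],
    };
    const vec = toksToTfIdfVector(
      ["firefox", "firefox", "browser", "unknown"], // "unknown" is dropped
      vocabIdfs
    );
    // term frequencies: firefox = 2, browser = 1
    // raw tf-idf:       firefox = 2 * 1.2 = 2.4, browser = 1 * 0.8 = 0.8
    // L2 norm = sqrt(2.4^2 + 0.8^2) ≈ 2.53
    // vec ≈ { firefox: [0, 0.95], browser: [1, 0.32] }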