Diffstat (limited to 'browser/components/newtab/lib/PersonalityProvider')
-rw-r--r--  browser/components/newtab/lib/PersonalityProvider/NaiveBayesTextTagger.jsm | 67
-rw-r--r--  browser/components/newtab/lib/PersonalityProvider/NmfTextTagger.jsm | 65
-rw-r--r--  browser/components/newtab/lib/PersonalityProvider/PersonalityProvider.jsm | 282
-rw-r--r--  browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorker.js | 44
-rw-r--r--  browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorkerClass.jsm | 311
-rw-r--r--  browser/components/newtab/lib/PersonalityProvider/RecipeExecutor.jsm | 1126
-rw-r--r--  browser/components/newtab/lib/PersonalityProvider/Tokenize.jsm | 89
7 files changed, 1984 insertions, 0 deletions
diff --git a/browser/components/newtab/lib/PersonalityProvider/NaiveBayesTextTagger.jsm b/browser/components/newtab/lib/PersonalityProvider/NaiveBayesTextTagger.jsm
new file mode 100644
index 0000000000..cc625076ba
--- /dev/null
+++ b/browser/components/newtab/lib/PersonalityProvider/NaiveBayesTextTagger.jsm
@@ -0,0 +1,67 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+"use strict";
+
+// We load this into a worker using importScripts, and in tests using import.
+// We use var to avoid name collision errors.
+// eslint-disable-next-line no-var
+var EXPORTED_SYMBOLS = ["NaiveBayesTextTagger"];
+
+const NaiveBayesTextTagger = class NaiveBayesTextTagger {
+ constructor(model, toksToTfIdfVector) {
+ this.model = model;
+ this.toksToTfIdfVector = toksToTfIdfVector;
+ }
+
+ /**
+ * Determines if the tokenized text belongs to a class according to a binary naive Bayes
+ * classifier. Returns an object containing the class label ("label"), and
+ * the log probability ("logProb") that the text belongs to that class. If
+ * the positive class is more likely, then "label" is the positive class
+ * label. If the negative class is matched, then "label" is set to null.
+ */
+ tagTokens(tokens) {
+ let fv = this.toksToTfIdfVector(tokens, this.model.vocab_idfs);
+
+ let bestLogProb = null;
+ let bestClassId = -1;
+ let bestClassLabel = null;
+ let logSumExp = 0.0; // will be P(x). Used to create a proper probability
+ for (let classId = 0; classId < this.model.classes.length; classId++) {
+ let classModel = this.model.classes[classId];
+ let classLogProb = classModel.log_prior;
+
+ // dot fv with the class model
+ for (let pair of Object.values(fv)) {
+ let [termId, tfidf] = pair;
+ classLogProb += tfidf * classModel.feature_log_probs[termId];
+ }
+
+ if (bestLogProb === null || classLogProb > bestLogProb) {
+ bestLogProb = classLogProb;
+ bestClassId = classId;
+ }
+ logSumExp += Math.exp(classLogProb);
+ }
+
+ // now normalize the probability by dividing by P(x)
+ logSumExp = Math.log(logSumExp);
+ bestLogProb -= logSumExp;
+ if (bestClassId === this.model.positive_class_id) {
+ bestClassLabel = this.model.positive_class_label;
+ } else {
+ bestClassLabel = null;
+ }
+
+ let confident =
+ bestClassId === this.model.positive_class_id &&
+ bestLogProb > this.model.positive_class_threshold_log_prob;
+ return {
+ label: bestClassLabel,
+ logProb: bestLogProb,
+ confident,
+ };
+ }
+};
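+
+// Illustrative usage, not part of the module: a sketch of driving the tagger with
+// a hypothetical `model` object shaped like the Remote Settings attachment this
+// class reads (classes, vocab_idfs, positive_class_id, positive_class_label,
+// positive_class_threshold_log_prob), plus tokenize/toksToTfIdfVector from Tokenize.jsm:
+//
+//   const tagger = new NaiveBayesTextTagger(model, toksToTfIdfVector);
+//   const { label, logProb, confident } = tagger.tagTokens(tokenize("some page text"));
+//   // label is model.positive_class_label or null; logProb is normalized by P(x).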
diff --git a/browser/components/newtab/lib/PersonalityProvider/NmfTextTagger.jsm b/browser/components/newtab/lib/PersonalityProvider/NmfTextTagger.jsm
new file mode 100644
index 0000000000..639c92b6e4
--- /dev/null
+++ b/browser/components/newtab/lib/PersonalityProvider/NmfTextTagger.jsm
@@ -0,0 +1,65 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+"use strict";
+
+// We load this into a worker using importScripts, and in tests using import.
+// We use var to avoid name collision errors.
+// eslint-disable-next-line no-var
+var EXPORTED_SYMBOLS = ["NmfTextTagger"];
+
+const NmfTextTagger = class NmfTextTagger {
+ constructor(model, toksToTfIdfVector) {
+ this.model = model;
+ this.toksToTfIdfVector = toksToTfIdfVector;
+ }
+
+ /**
+ * A multiclass classifier that scores tokenized text against several classes through
+ * inference of a nonnegative matrix factorization of TF-IDF vectors and
+ * class labels. Returns a map of class labels as string keys to scores.
+ * (Higher is more confident.) All classes get scored, so it is up to the
+ * consumer of this data to determine which classes are most valuable.
+ */
+ tagTokens(tokens) {
+ let fv = this.toksToTfIdfVector(tokens, this.model.vocab_idfs);
+ let fve = Object.values(fv);
+
+ // normalize by the sum of the vector
+ let sum = 0.0;
+ for (let pair of fve) {
+ // eslint-disable-next-line prefer-destructuring
+ sum += pair[1];
+ }
+ for (let i = 0; i < fve.length; i++) {
+ // eslint-disable-next-line prefer-destructuring
+ fve[i][1] /= sum;
+ }
+
+ // dot the document with each topic vector so that we can transform it into
+ // the latent space
+ let toksInLatentSpace = [];
+ for (let topicVect of this.model.topic_word) {
+ let fvDotTwv = 0;
+ // dot fv with each topic word vector
+ for (let pair of fve) {
+ let [termId, tfidf] = pair;
+ fvDotTwv += tfidf * topicVect[termId];
+ }
+ toksInLatentSpace.push(fvDotTwv);
+ }
+
+ // now project toksInLatentSpace back into class space
+ let predictions = {};
+ Object.keys(this.model.document_topic).forEach(topic => {
+ let score = 0;
+ for (let i = 0; i < toksInLatentSpace.length; i++) {
+ score += toksInLatentSpace[i] * this.model.document_topic[topic][i];
+ }
+ predictions[topic] = score;
+ });
+
+ return predictions;
+ }
+};
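+
+// Illustrative usage, not part of the module: a sketch assuming a hypothetical NMF
+// `model` carrying the vocab_idfs, topic_word, and document_topic fields this class
+// reads, plus tokenize/toksToTfIdfVector from Tokenize.jsm:
+//
+//   const tagger = new NmfTextTagger(model, toksToTfIdfVector);
+//   const scores = tagger.tagTokens(tokenize("some page text"));
+//   // scores maps every class label to a number; higher means more confident.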
diff --git a/browser/components/newtab/lib/PersonalityProvider/PersonalityProvider.jsm b/browser/components/newtab/lib/PersonalityProvider/PersonalityProvider.jsm
new file mode 100644
index 0000000000..c1f54408f2
--- /dev/null
+++ b/browser/components/newtab/lib/PersonalityProvider/PersonalityProvider.jsm
@@ -0,0 +1,282 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+"use strict";
+
+const lazy = {};
+
+ChromeUtils.defineESModuleGetters(lazy, {
+ NewTabUtils: "resource://gre/modules/NewTabUtils.sys.mjs",
+ RemoteSettings: "resource://services-settings/remote-settings.sys.mjs",
+ Utils: "resource://services-settings/Utils.sys.mjs",
+});
+
+const { BasePromiseWorker } = ChromeUtils.importESModule(
+ "resource://gre/modules/PromiseWorker.sys.mjs"
+);
+
+const RECIPE_NAME = "personality-provider-recipe";
+const MODELS_NAME = "personality-provider-models";
+
+class PersonalityProvider {
+ constructor(modelKeys) {
+ this.modelKeys = modelKeys;
+ this.onSync = this.onSync.bind(this);
+ this.setup();
+ }
+
+ setScores(scores) {
+ this.scores = scores || {};
+ this.interestConfig = this.scores.interestConfig;
+ this.interestVector = this.scores.interestVector;
+ }
+
+ get personalityProviderWorker() {
+ if (this._personalityProviderWorker) {
+ return this._personalityProviderWorker;
+ }
+
+ this._personalityProviderWorker = new BasePromiseWorker(
+ "resource://activity-stream/lib/PersonalityProvider/PersonalityProviderWorker.js"
+ );
+
+ return this._personalityProviderWorker;
+ }
+
+ get baseAttachmentsURL() {
+ // Returning a promise, so we can have an async getter.
+ return this._getBaseAttachmentsURL();
+ }
+
+ async _getBaseAttachmentsURL() {
+ if (this._baseAttachmentsURL) {
+ return this._baseAttachmentsURL;
+ }
+ const server = lazy.Utils.SERVER_URL;
+ const serverInfo = await (
+ await fetch(`${server}/`, {
+ credentials: "omit",
+ })
+ ).json();
+ const {
+ capabilities: {
+ attachments: { base_url },
+ },
+ } = serverInfo;
+ this._baseAttachmentsURL = base_url;
+ return this._baseAttachmentsURL;
+ }
+
+ setup() {
+ this.setupSyncAttachment(RECIPE_NAME);
+ this.setupSyncAttachment(MODELS_NAME);
+ }
+
+ teardown() {
+ this.teardownSyncAttachment(RECIPE_NAME);
+ this.teardownSyncAttachment(MODELS_NAME);
+ if (this._personalityProviderWorker) {
+ this._personalityProviderWorker.terminate();
+ }
+ }
+
+ setupSyncAttachment(collection) {
+ lazy.RemoteSettings(collection).on("sync", this.onSync);
+ }
+
+ teardownSyncAttachment(collection) {
+ lazy.RemoteSettings(collection).off("sync", this.onSync);
+ }
+
+ onSync(event) {
+ this.personalityProviderWorker.post("onSync", [event]);
+ }
+
+ /**
+ * Gets contents of the attachment if it already exists on file,
+ * and if not attempts to download it.
+ */
+ getAttachment(record) {
+ return this.personalityProviderWorker.post("getAttachment", [record]);
+ }
+
+ /**
+ * Returns a Recipe from remote settings to be consumed by a RecipeExecutor.
+ * A Recipe is a set of instructions that a RecipeExecutor runs over an item.
+ */
+ async getRecipe() {
+ if (!this.recipes || !this.recipes.length) {
+ const result = await lazy.RemoteSettings(RECIPE_NAME).get();
+ this.recipes = await Promise.all(
+ result.map(async record => ({
+ ...(await this.getAttachment(record)),
+ recordKey: record.key,
+ }))
+ );
+ }
+ return this.recipes[0];
+ }
+
+ /**
+ * Grabs a slice of browsing history for building an interest vector
+ */
+ async fetchHistory(columns, beginTimeSecs, endTimeSecs) {
+ let sql = `SELECT url, title, visit_count, frecency, last_visit_date, description
+ FROM moz_places
+ WHERE last_visit_date >= ${beginTimeSecs * 1000000}
+ AND last_visit_date < ${endTimeSecs * 1000000}`;
+ columns.forEach(requiredColumn => {
+ sql += ` AND IFNULL(${requiredColumn}, '') <> ''`;
+ });
+ sql += " LIMIT 30000";
+
+ const { activityStreamProvider } = lazy.NewTabUtils;
+ const history = await activityStreamProvider.executePlacesQuery(sql, {
+ columns,
+ params: {},
+ });
+
+ return history;
+ }
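+
+ // For illustration (not part of the module): with columns = ["title", "description"],
+ // the query built above looks like
+ //   SELECT url, title, visit_count, frecency, last_visit_date, description
+ //   FROM moz_places
+ //   WHERE last_visit_date >= <beginTimeSecs * 1000000>
+ //     AND last_visit_date < <endTimeSecs * 1000000>
+ //     AND IFNULL(title, '') <> ''
+ //     AND IFNULL(description, '') <> '' LIMIT 30000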
+
+ /**
+ * Handles setup and metrics of history fetch.
+ */
+ async getHistory() {
+ // Check the config before computing the time window; the window length
+ // comes from this.interestConfig.
+ if (
+ !this.interestConfig ||
+ !this.interestConfig.history_required_fields ||
+ !this.interestConfig.history_required_fields.length
+ ) {
+ return [];
+ }
+ let endTimeSecs = new Date().getTime() / 1000;
+ let beginTimeSecs = endTimeSecs - this.interestConfig.history_limit_secs;
+ let history = await this.fetchHistory(
+ this.interestConfig.history_required_fields,
+ beginTimeSecs,
+ endTimeSecs
+ );
+
+ return history;
+ }
+
+ async setBaseAttachmentsURL() {
+ await this.personalityProviderWorker.post("setBaseAttachmentsURL", [
+ await this.baseAttachmentsURL,
+ ]);
+ }
+
+ async setInterestConfig() {
+ this.interestConfig = this.interestConfig || (await this.getRecipe());
+ await this.personalityProviderWorker.post("setInterestConfig", [
+ this.interestConfig,
+ ]);
+ }
+
+ async setInterestVector() {
+ await this.personalityProviderWorker.post("setInterestVector", [
+ this.interestVector,
+ ]);
+ }
+
+ async fetchModels() {
+ const models = await lazy.RemoteSettings(MODELS_NAME).get();
+ return this.personalityProviderWorker.post("fetchModels", [models]);
+ }
+
+ async generateTaggers() {
+ await this.personalityProviderWorker.post("generateTaggers", [
+ this.modelKeys,
+ ]);
+ }
+
+ async generateRecipeExecutor() {
+ await this.personalityProviderWorker.post("generateRecipeExecutor");
+ }
+
+ async createInterestVector() {
+ const history = await this.getHistory();
+
+ const interestVectorResult = await this.personalityProviderWorker.post(
+ "createInterestVector",
+ [history]
+ );
+
+ return interestVectorResult;
+ }
+
+ async init(callback) {
+ await this.setBaseAttachmentsURL();
+ await this.setInterestConfig();
+ if (!this.interestConfig) {
+ return;
+ }
+
+ // We always generate a recipe executor, no cache used here.
+ // This is because the result of this is an object with
+ // functions (taggers) so storing it in cache is not possible.
+ // Thus we cannot use it to rehydrate anything.
+ const fetchModelsResult = await this.fetchModels();
+ // If fetching models failed, we have nothing to tag with, so return.
+ if (!fetchModelsResult.ok) {
+ return;
+ }
+ await this.generateTaggers();
+ await this.generateRecipeExecutor();
+
+ // If we don't have a cached vector, create a new one.
+ if (!this.interestVector) {
+ const interestVectorResult = await this.createInterestVector();
+ // If creating the interest vector failed, return.
+ if (!interestVectorResult.ok) {
+ return;
+ }
+ this.interestVector = interestVectorResult.interestVector;
+ }
+
+ // This happens outside the createInterestVector call above,
+ // because create can be skipped if rehydrating from cache.
+ // In that case, the interest vector is provided and not created, so we just set it.
+ await this.setInterestVector();
+
+ this.initialized = true;
+ if (callback) {
+ callback();
+ }
+ }
+
+ async calculateItemRelevanceScore(pocketItem) {
+ if (!this.initialized) {
+ return pocketItem.item_score || 1;
+ }
+ const itemRelevanceScore = await this.personalityProviderWorker.post(
+ "calculateItemRelevanceScore",
+ [pocketItem]
+ );
+ if (!itemRelevanceScore) {
+ return -1;
+ }
+ const { scorableItem, rankingVector } = itemRelevanceScore;
+ // Put the results on the item for debugging purposes.
+ pocketItem.scorableItem = scorableItem;
+ pocketItem.rankingVector = rankingVector;
+ return rankingVector.score;
+ }
+
+ /**
+ * Returns an object holding the personalization scores of this provider instance.
+ */
+ getScores() {
+ return {
+ // We cannot return taggers here.
+ // What we return here goes into the persistent cache, and taggers have functions on them.
+ // If we attempted to save taggers into the persistent cache, they would be stored to disk,
+ // and the next time we loaded them, we would start throwing "function is not defined" errors.
+ interestConfig: this.interestConfig,
+ interestVector: this.interestVector,
+ };
+ }
+}
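+
+// Illustrative usage, not part of the module: a sketch of how a consumer might drive
+// the provider. The model keys, cachedScores, and pocketItem are hypothetical placeholders.
+//
+//   const provider = new PersonalityProvider(["nb-model-example", "nmf-model-example"]);
+//   provider.setScores(cachedScores || {});
+//   await provider.init();
+//   const score = await provider.calculateItemRelevanceScore(pocketItem);
+//   const scoresToCache = provider.getScores();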
+
+const EXPORTED_SYMBOLS = ["PersonalityProvider"];
diff --git a/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorker.js b/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorker.js
new file mode 100644
index 0000000000..3ed118857e
--- /dev/null
+++ b/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorker.js
@@ -0,0 +1,44 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* eslint-env mozilla/chrome-worker */
+
+"use strict";
+
+// Order of these are important.
+/* import-globals-from /toolkit/components/workerloader/require.js */
+/* import-globals-from Tokenize.jsm */
+/* import-globals-from NaiveBayesTextTagger.jsm */
+/* import-globals-from NmfTextTagger.jsm */
+/* import-globals-from RecipeExecutor.jsm */
+/* import-globals-from PersonalityProviderWorkerClass.jsm */
+importScripts(
+ "resource://gre/modules/workers/require.js",
+ "resource://activity-stream/lib/PersonalityProvider/Tokenize.jsm",
+ "resource://activity-stream/lib/PersonalityProvider/NaiveBayesTextTagger.jsm",
+ "resource://activity-stream/lib/PersonalityProvider/NmfTextTagger.jsm",
+ "resource://activity-stream/lib/PersonalityProvider/RecipeExecutor.jsm",
+ "resource://activity-stream/lib/PersonalityProvider/PersonalityProviderWorkerClass.jsm"
+);
+
+const PromiseWorker = require("resource://gre/modules/workers/PromiseWorker.js");
+
+const personalityProviderWorker = new PersonalityProviderWorker();
+
+// This is boilerplate worker code that connects it to the main thread PromiseWorker.
+const worker = new PromiseWorker.AbstractWorker();
+worker.dispatch = function (method, args = []) {
+ return personalityProviderWorker[method](...args);
+};
+worker.postMessage = function (message, ...transfers) {
+ self.postMessage(message, ...transfers);
+};
+worker.close = function () {
+ self.close();
+};
+
+self.addEventListener("message", msg => worker.handleMessage(msg));
+self.addEventListener("unhandledrejection", function (error) {
+ throw error.reason;
+});
diff --git a/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorkerClass.jsm b/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorkerClass.jsm
new file mode 100644
index 0000000000..e761f827d2
--- /dev/null
+++ b/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorkerClass.jsm
@@ -0,0 +1,311 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+"use strict";
+
+// PersonalityProviderWorker.js imports the following scripts before this.
+/* import-globals-from Tokenize.jsm */
+/* import-globals-from NaiveBayesTextTagger.jsm */
+/* import-globals-from NmfTextTagger.jsm */
+/* import-globals-from RecipeExecutor.jsm */
+
+// We load this into a worker using importScripts, and in tests using import.
+// We use var to avoid name collision errors.
+// eslint-disable-next-line no-var
+var EXPORTED_SYMBOLS = ["PersonalityProviderWorker"];
+
+// A helper function to create a hash out of a file.
+async function _getFileHash(filepath) {
+ const data = await IOUtils.read(filepath);
+ // data is a Uint8Array of the file's bytes.
+ const digest = await crypto.subtle.digest("SHA-256", data);
+ const uint8 = new Uint8Array(digest);
+ // return the two-digit hexadecimal code for a byte
+ const toHex = b => b.toString(16).padStart(2, "0");
+ return Array.from(uint8, toHex).join("");
+}
+
+/**
+ * The V2 provider builds and ranks an interest profile (also called an “interest vector”)
+ * from the browsing history. This allows Firefox to classify pages into topics
+ * by examining the text found on the page: the history item's content, title, and description.
+ */
+const PersonalityProviderWorker = class PersonalityProviderWorker {
+ async getPersonalityProviderDir() {
+ const personalityProviderDir = PathUtils.join(
+ await PathUtils.getLocalProfileDir(),
+ "personality-provider"
+ );
+
+ // Cache this so we don't need to await again.
+ this.getPersonalityProviderDir = () =>
+ Promise.resolve(personalityProviderDir);
+ return personalityProviderDir;
+ }
+
+ setBaseAttachmentsURL(url) {
+ this.baseAttachmentsURL = url;
+ }
+
+ setInterestConfig(interestConfig) {
+ this.interestConfig = interestConfig;
+ }
+
+ setInterestVector(interestVector) {
+ this.interestVector = interestVector;
+ }
+
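+ // A sketch of the RemoteSettings "sync" event shape consumed below (illustrative,
+ // based on the destructuring in onSync):
+ //   { data: { created: [...], updated: [{ old, new }, ...], deleted: [...] } }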
+ onSync(event) {
+ const {
+ data: { created, updated, deleted },
+ } = event;
+ // Remove every removed attachment.
+ const toRemove = deleted.concat(updated.map(u => u.old));
+ toRemove.forEach(record => this.deleteAttachment(record));
+
+ // Download every new/updated attachment.
+ const toDownload = created.concat(updated.map(u => u.new));
+ // maybeDownloadAttachment is async but we don't care inside onSync.
+ toDownload.forEach(record => this.maybeDownloadAttachment(record));
+ }
+
+ /**
+ * Attempts to download the attachment, but only if it doesn't already exist.
+ */
+ async maybeDownloadAttachment(record, retries = 3) {
+ const {
+ attachment: { filename, hash, size },
+ } = record;
+ await IOUtils.makeDirectory(await this.getPersonalityProviderDir());
+ const localFilePath = PathUtils.join(
+ await this.getPersonalityProviderDir(),
+ filename
+ );
+
+ let retry = 0;
+ while (
+ retry++ < retries &&
+ // Note: the exists() check has a perf cost and may not always be needed.
+ (!(await IOUtils.exists(localFilePath)) ||
+ (await IOUtils.stat(localFilePath)).size !== size ||
+ (await _getFileHash(localFilePath)) !== hash)
+ ) {
+ await this._downloadAttachment(record);
+ }
+ }
+
+ /**
+ * Downloads the attachment to disk, assuming the directory already exists.
+ * Any existing file with the same filename is clobbered.
+ */
+ async _downloadAttachment(record) {
+ const {
+ attachment: { location, filename },
+ } = record;
+ const remoteFilePath = this.baseAttachmentsURL + location;
+ const localFilePath = PathUtils.join(
+ await this.getPersonalityProviderDir(),
+ filename
+ );
+
+ const xhr = new XMLHttpRequest();
+ // Set false here for a synchronous request, because we're in a worker.
+ xhr.open("GET", remoteFilePath, false);
+ xhr.setRequestHeader("Accept-Encoding", "gzip");
+ xhr.responseType = "arraybuffer";
+ xhr.withCredentials = false;
+ xhr.send(null);
+
+ if (xhr.status !== 200) {
+ console.error(`Failed to fetch ${remoteFilePath}: ${xhr.statusText}`);
+ return;
+ }
+
+ const buffer = xhr.response;
+ const bytes = new Uint8Array(buffer);
+
+ await IOUtils.write(localFilePath, bytes, {
+ tmpPath: `${localFilePath}.tmp`,
+ });
+ }
+
+ async deleteAttachment(record) {
+ const {
+ attachment: { filename },
+ } = record;
+ await IOUtils.makeDirectory(await this.getPersonalityProviderDir());
+ const path = PathUtils.join(
+ await this.getPersonalityProviderDir(),
+ filename
+ );
+
+ await IOUtils.remove(path, { ignoreAbsent: true });
+ // Clean up the directory if it is empty; do nothing if it is not empty.
+ try {
+ await IOUtils.remove(await this.getPersonalityProviderDir(), {
+ ignoreAbsent: true,
+ });
+ } catch (e) {
+ // This is likely because the directory is not empty, so we don't care.
+ }
+ }
+
+ /**
+ * Gets contents of the attachment if it already exists on file,
+ * and if not attempts to download it.
+ */
+ async getAttachment(record) {
+ const {
+ attachment: { filename },
+ } = record;
+ const filepath = PathUtils.join(
+ await this.getPersonalityProviderDir(),
+ filename
+ );
+
+ try {
+ await this.maybeDownloadAttachment(record);
+ return await IOUtils.readJSON(filepath);
+ } catch (error) {
+ console.error(`Failed to load ${filepath}: ${error.message}`);
+ }
+ return {};
+ }
+
+ async fetchModels(models) {
+ this.models = await Promise.all(
+ models.map(async record => ({
+ ...(await this.getAttachment(record)),
+ recordKey: record.key,
+ }))
+ );
+ if (!this.models.length) {
+ return {
+ ok: false,
+ };
+ }
+ return {
+ ok: true,
+ };
+ }
+
+ generateTaggers(modelKeys) {
+ if (!this.taggers) {
+ let nbTaggers = [];
+ let nmfTaggers = {};
+
+ for (let model of this.models) {
+ if (!modelKeys.includes(model.recordKey)) {
+ continue;
+ }
+ if (model.model_type === "nb") {
+ nbTaggers.push(new NaiveBayesTextTagger(model, toksToTfIdfVector));
+ } else if (model.model_type === "nmf") {
+ nmfTaggers[model.parent_tag] = new NmfTextTagger(
+ model,
+ toksToTfIdfVector
+ );
+ }
+ }
+ this.taggers = { nbTaggers, nmfTaggers };
+ }
+ }
+
+ /**
+ * Sets and generates a Recipe Executor.
+ * A Recipe Executor is a set of actions that can be consumed by a Recipe.
+ * The Recipe determines the order and specifics of how the actions are called.
+ */
+ generateRecipeExecutor() {
+ const recipeExecutor = new RecipeExecutor(
+ this.taggers.nbTaggers,
+ this.taggers.nmfTaggers,
+ tokenize
+ );
+ this.recipeExecutor = recipeExecutor;
+ }
+
+ /**
+ * Examines the user's browse history and returns an interest vector that
+ * describes the topics the user frequently browses.
+ */
+ createInterestVector(history) {
+ let interestVector = {};
+
+ for (let historyRec of history) {
+ let ivItem = this.recipeExecutor.executeRecipe(
+ historyRec,
+ this.interestConfig.history_item_builder
+ );
+ if (ivItem === null) {
+ continue;
+ }
+ interestVector = this.recipeExecutor.executeCombinerRecipe(
+ interestVector,
+ ivItem,
+ this.interestConfig.interest_combiner
+ );
+ if (interestVector === null) {
+ return null;
+ }
+ }
+
+ const finalResult = this.recipeExecutor.executeRecipe(
+ interestVector,
+ this.interestConfig.interest_finalizer
+ );
+
+ return {
+ ok: true,
+ interestVector: finalResult,
+ };
+ }
+
+ /**
+ * Calculates a score of a Pocket item when compared to the user's interest
+ * vector. Returns the score. Higher scores are better. Assumes this.interestVector
+ * is populated.
+ */
+ calculateItemRelevanceScore(pocketItem) {
+ const { personalization_models } = pocketItem;
+ let scorableItem;
+
+ // If the server provides some models, we can just use them,
+ // and skip generating them.
+ if (personalization_models && Object.keys(personalization_models).length) {
+ scorableItem = {
+ id: pocketItem.id,
+ item_tags: personalization_models,
+ item_score: pocketItem.item_score,
+ item_sort_id: 1,
+ };
+ } else {
+ scorableItem = this.recipeExecutor.executeRecipe(
+ pocketItem,
+ this.interestConfig.item_to_rank_builder
+ );
+ if (scorableItem === null) {
+ return null;
+ }
+ }
+
+ // We're doing a deep copy on an object.
+ let rankingVector = JSON.parse(JSON.stringify(this.interestVector));
+
+ Object.keys(scorableItem).forEach(key => {
+ rankingVector[key] = scorableItem[key];
+ });
+
+ rankingVector = this.recipeExecutor.executeRecipe(
+ rankingVector,
+ this.interestConfig.item_ranker
+ );
+
+ if (rankingVector === null) {
+ return null;
+ }
+
+ return { scorableItem, rankingVector };
+ }
+};
diff --git a/browser/components/newtab/lib/PersonalityProvider/RecipeExecutor.jsm b/browser/components/newtab/lib/PersonalityProvider/RecipeExecutor.jsm
new file mode 100644
index 0000000000..9dbf8b802d
--- /dev/null
+++ b/browser/components/newtab/lib/PersonalityProvider/RecipeExecutor.jsm
@@ -0,0 +1,1126 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+"use strict";
+
+// We load this into a worker using importScripts, and in tests using import.
+// We use var to avoid name collision errors.
+// eslint-disable-next-line no-var
+var EXPORTED_SYMBOLS = ["RecipeExecutor"];
+
+/**
+ * RecipeExecutor is the core feature engineering pipeline for the in-browser
+ * personalization work. These pipelines are called "recipes". A recipe is an
+ * array of objects that define a "step" in the recipe. A step is simply an
+ * object with a field "function" that specifies what is being done in the step
+ * along with other fields that are semantically defined for that step.
+ *
+ * There are two types of recipes: "builder" recipes and "combiner" recipes. Builder
+ * recipes mutate an object until it matches some set of criteria. Combiner
+ * recipes take two objects (a "left" and a "right"), and specify the steps
+ * to merge the right object into the left object.
+ *
+ * A short nonsense example recipe is:
+ * [ {"function": "get_url_domain", "path_length": 1, "field": "url", "dest": "url_domain"},
+ * {"function": "nb_tag", "fields": ["title", "description"]},
+ * {"function": "conditionally_nmf_tag", "fields": ["title", "description"]} ]
+ *
+ * Recipes are sandboxed by the fact that the step functions must be explicitly
+ * allowed. Functions allowed for builder recipes are specified in
+ * RecipeExecutor.ITEM_BUILDER_REGISTRY, while allowed combiner functions are
+ * specified in RecipeExecutor.ITEM_COMBINER_REGISTRY.
+ */
+const RecipeExecutor = class RecipeExecutor {
+ constructor(nbTaggers, nmfTaggers, tokenize) {
+ this.ITEM_BUILDER_REGISTRY = {
+ nb_tag: this.naiveBayesTag,
+ conditionally_nmf_tag: this.conditionallyNmfTag,
+ accept_item_by_field_value: this.acceptItemByFieldValue,
+ tokenize_url: this.tokenizeUrl,
+ get_url_domain: this.getUrlDomain,
+ tokenize_field: this.tokenizeField,
+ copy_value: this.copyValue,
+ keep_top_k: this.keepTopK,
+ scalar_multiply: this.scalarMultiply,
+ elementwise_multiply: this.elementwiseMultiply,
+ vector_multiply: this.vectorMultiply,
+ scalar_add: this.scalarAdd,
+ vector_add: this.vectorAdd,
+ make_boolean: this.makeBoolean,
+ allow_fields: this.allowFields,
+ filter_by_value: this.filterByValue,
+ l2_normalize: this.l2Normalize,
+ prob_normalize: this.probNormalize,
+ set_default: this.setDefault,
+ lookup_value: this.lookupValue,
+ copy_to_map: this.copyToMap,
+ scalar_multiply_tag: this.scalarMultiplyTag,
+ apply_softmax_tags: this.applySoftmaxTags,
+ };
+ this.ITEM_COMBINER_REGISTRY = {
+ combiner_add: this.combinerAdd,
+ combiner_max: this.combinerMax,
+ combiner_collect_values: this.combinerCollectValues,
+ };
+ this.nbTaggers = nbTaggers;
+ this.nmfTaggers = nmfTaggers;
+ this.tokenize = tokenize;
+ }
+
+ /**
+ * Determines the type of a field. Valid types are:
+ * string
+ * number
+ * array
+ * map (strings to anything)
+ */
+ _typeOf(data) {
+ let t = typeof data;
+ if (t === "object") {
+ if (data === null) {
+ return "null";
+ }
+ if (Array.isArray(data)) {
+ return "array";
+ }
+ return "map";
+ }
+ return t;
+ }
+
+ /**
+ * Returns a scalar, either because it was a constant, or by
+ * looking it up from the item. Allows for a default value if the lookup
+ * fails.
+ */
+ _lookupScalar(item, k, dfault) {
+ if (this._typeOf(k) === "number") {
+ return k;
+ } else if (
+ this._typeOf(k) === "string" &&
+ k in item &&
+ this._typeOf(item[k]) === "number"
+ ) {
+ return item[k];
+ }
+ return dfault;
+ }
+
+ /**
+ * Simply appends all the strings from a set of fields together. If a field
+ * is a list, then the cells of the list are appended.
+ */
+ _assembleText(item, fields) {
+ let textArr = [];
+ for (let field of fields) {
+ if (field in item) {
+ let type = this._typeOf(item[field]);
+ if (type === "string") {
+ textArr.push(item[field]);
+ } else if (type === "array") {
+ for (let ele of item[field]) {
+ textArr.push(String(ele));
+ }
+ } else {
+ textArr.push(String(item[field]));
+ }
+ }
+ }
+ return textArr.join(" ");
+ }
+
+ /**
+ * Runs the naive bayes text taggers over a set of text fields. Stores the
+ * results in new fields:
+ * nb_tags: a map of text strings to probabilities
+ * nb_tokens: the tokenized text that was tagged
+ *
+ * Config:
+ * fields: an array containing a list of fields to concatenate and tag
+ */
+ naiveBayesTag(item, config) {
+ let text = this._assembleText(item, config.fields);
+ let tokens = this.tokenize(text);
+ let tags = {};
+ let extended_tags = {};
+
+ for (let nbTagger of this.nbTaggers) {
+ let result = nbTagger.tagTokens(tokens);
+ if (result.label !== null && result.confident) {
+ extended_tags[result.label] = result;
+ tags[result.label] = Math.exp(result.logProb);
+ }
+ }
+ item.nb_tags = tags;
+ item.nb_tags_extended = extended_tags;
+ item.nb_tokens = tokens;
+ return item;
+ }
+
+ /**
+ * Selectively runs NMF text taggers depending on which tags were found
+ * by the naive bayes taggers. Writes the results in into new fields:
+ * nmf_tags_parent_weights: map of parent tags to probabilities of those parent tags
+ * nmf_tags: map of strings to maps of strings to probabilities
+ * nmf_tags_parent: map of child tags to parent tags
+ *
+ * Config:
+ * Not configurable
+ */
+ conditionallyNmfTag(item, config) {
+ let nestedNmfTags = {};
+ let parentTags = {};
+ let parentWeights = {};
+
+ if (!("nb_tags" in item) || !("nb_tokens" in item)) {
+ return null;
+ }
+
+ Object.keys(item.nb_tags).forEach(parentTag => {
+ let nmfTagger = this.nmfTaggers[parentTag];
+ if (nmfTagger !== undefined) {
+ nestedNmfTags[parentTag] = {};
+ parentWeights[parentTag] = item.nb_tags[parentTag];
+ let nmfTags = nmfTagger.tagTokens(item.nb_tokens);
+ Object.keys(nmfTags).forEach(nmfTag => {
+ nestedNmfTags[parentTag][nmfTag] = nmfTags[nmfTag];
+ parentTags[nmfTag] = parentTag;
+ });
+ }
+ });
+
+ item.nmf_tags = nestedNmfTags;
+ item.nmf_tags_parent = parentTags;
+ item.nmf_tags_parent_weights = parentWeights;
+
+ return item;
+ }
+
+ /**
+ * Checks a field's value against another value (either from another field
+ * or a constant). If the test passes, then the item is emitted, otherwise
+ * the pipeline is aborted.
+ *
+ * Config:
+ * field Field to read the value to test. Left side of operator.
+ * op one of ==, !=, <, <=, >, >=
+ * rhsValue Constant value to compare against. Right side of operator.
+ * rhsField Field to read value to compare against. Right side of operator.
+ *
+ * NOTE: rhsValue takes precedence over rhsField.
+ */
+ acceptItemByFieldValue(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+ let rhs = null;
+ if ("rhsValue" in config) {
+ rhs = config.rhsValue;
+ } else if ("rhsField" in config && config.rhsField in item) {
+ rhs = item[config.rhsField];
+ }
+ if (rhs === null) {
+ return null;
+ }
+
+ if (
+ // eslint-disable-next-line eqeqeq
+ (config.op === "==" && item[config.field] == rhs) ||
+ // eslint-disable-next-line eqeqeq
+ (config.op === "!=" && item[config.field] != rhs) ||
+ (config.op === "<" && item[config.field] < rhs) ||
+ (config.op === "<=" && item[config.field] <= rhs) ||
+ (config.op === ">" && item[config.field] > rhs) ||
+ (config.op === ">=" && item[config.field] >= rhs)
+ ) {
+ return item;
+ }
+
+ return null;
+ }
+
+ /**
+ * Splits a URL into text-like tokens.
+ *
+ * Config:
+ * field Field containing a URL
+ * dest Field to write the tokens to as an array of strings
+ *
+ * NOTE: Any initial 'www' on the hostname is removed.
+ */
+ tokenizeUrl(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+
+ let url = new URL(item[config.field]);
+ let domain = url.hostname;
+ if (domain.startsWith("www.")) {
+ domain = domain.substring(4);
+ }
+ let toks = this.tokenize(domain);
+ let pathToks = this.tokenize(
+ decodeURIComponent(url.pathname.replace(/\+/g, " "))
+ );
+ for (let tok of pathToks) {
+ toks.push(tok);
+ }
+ for (let pair of url.searchParams.entries()) {
+ let k = this.tokenize(decodeURIComponent(pair[0].replace(/\+/g, " ")));
+ for (let tok of k) {
+ toks.push(tok);
+ }
+ if (pair[1] !== null && pair[1] !== "") {
+ let v = this.tokenize(decodeURIComponent(pair[1].replace(/\+/g, " ")));
+ for (let tok of v) {
+ toks.push(tok);
+ }
+ }
+ }
+ item[config.dest] = toks;
+
+ return item;
+ }
+
+ /**
+ * Gets the hostname (minus any initial "www.") along with the leftmost
+ * directories on the path.
+ *
+ * Config:
+ * field Field containing the URL
+ * dest Field to write the array of strings to
+ * path_length OPTIONAL (DEFAULT: 0) Number of leftmost subdirectories to include
+ */
+ getUrlDomain(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+
+ let url = new URL(item[config.field]);
+ let domain = url.hostname.toLocaleLowerCase();
+ if (domain.startsWith("www.")) {
+ domain = domain.substring(4);
+ }
+ item[config.dest] = domain;
+ let pathLength = 0;
+ if ("path_length" in config) {
+ pathLength = config.path_length;
+ }
+ if (pathLength > 0) {
+ item[config.dest] += url.pathname
+ .toLocaleLowerCase()
+ .split("/")
+ .slice(0, pathLength + 1)
+ .join("/");
+ }
+
+ return item;
+ }
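+
+ // For illustration (not part of the module): with
+ //   item = { url: "https://www.example.com/autos/sports/coupe" }
+ // and config = { field: "url", dest: "url_domain", path_length: 1 },
+ // getUrlDomain sets item.url_domain to "example.com/autos".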
+
+ /**
+ * Splits a field into tokens.
+ * Config:
+ * field Field containing a string to tokenize
+ * dest Field to write the array of strings to
+ */
+ tokenizeField(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+
+ item[config.dest] = this.tokenize(item[config.field]);
+
+ return item;
+ }
+
+ /**
+ * Deep copy from one field to another.
+ * Config:
+ * src Field to read from
+ * dest Field to write to
+ */
+ copyValue(item, config) {
+ if (!(config.src in item)) {
+ return null;
+ }
+
+ item[config.dest] = JSON.parse(JSON.stringify(item[config.src]));
+
+ return item;
+ }
+
+ /**
+ * Converts a field containing a map of strings to a map of strings
+ * to numbers, to a map of strings to numbers containing at most k elements.
+ * This operation is performed by first promoting all the subkeys up one
+ * level, and then taking the top (or bottom) k values.
+ *
+ * Config:
+ * field Points to a map of strings to a map of strings to numbers
+ * k Maximum number of items to keep
+ * descending OPTIONAL (DEFAULT: True) Sorts score in descending order
+ * (i.e. keeps maximum)
+ */
+ keepTopK(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+ let k = this._lookupScalar(item, config.k, 1048576);
+ let descending = !("descending" in config) || config.descending !== false;
+
+ // we can't sort by the values in the map, so we have to convert this
+ // to an array, and then sort.
+ let sortable = [];
+ Object.keys(item[config.field]).forEach(outerKey => {
+ let innerType = this._typeOf(item[config.field][outerKey]);
+ if (innerType === "map") {
+ Object.keys(item[config.field][outerKey]).forEach(innerKey => {
+ sortable.push({
+ key: innerKey,
+ value: item[config.field][outerKey][innerKey],
+ });
+ });
+ } else {
+ sortable.push({ key: outerKey, value: item[config.field][outerKey] });
+ }
+ });
+
+ sortable.sort((a, b) => {
+ if (descending) {
+ return b.value - a.value;
+ }
+ return a.value - b.value;
+ });
+
+ // now take the top k
+ let newMap = {};
+ let i = 0;
+ for (let pair of sortable) {
+ if (i >= k) {
+ break;
+ }
+ newMap[pair.key] = pair.value;
+ i++;
+ }
+ item[config.field] = newMap;
+
+ return item;
+ }
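+
+ // For illustration (not part of the module): with
+ //   item = { tags: { autos: { sports: 0.9, sedans: 0.4 }, food: { baking: 0.7 } } }
+ // and config = { field: "tags", k: 2 },
+ // keepTopK flattens the subkeys and keeps the two largest values:
+ //   item.tags === { sports: 0.9, baking: 0.7 }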
+
+ /**
+ * Scalar multiplies a vector by some constant
+ *
+ * Config:
+ * field Points to:
+ * a map of strings to numbers
+ * an array of numbers
+ * a number
+ * k Either a number, or a string. If it's a number, then this
+ * is the scalar value to multiply by. If it's a string,
+ * the value in the pointed-to field is used.
+ * dfault OPTIONAL (DEFAULT: 0) If k is a string and no numeric
+ * value is found, then use this value.
+ */
+ scalarMultiply(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+ let k = this._lookupScalar(item, config.k, config.dfault);
+
+ let fieldType = this._typeOf(item[config.field]);
+ if (fieldType === "number") {
+ item[config.field] *= k;
+ } else if (fieldType === "array") {
+ for (let i = 0; i < item[config.field].length; i++) {
+ item[config.field][i] *= k;
+ }
+ } else if (fieldType === "map") {
+ Object.keys(item[config.field]).forEach(key => {
+ item[config.field][key] *= k;
+ });
+ } else {
+ return null;
+ }
+
+ return item;
+ }
+
+ /**
+ * Elementwise multiplies either two maps or two arrays together, storing
+ * the result in left. If left and right are not of the same type, this
+ * results in an error.
+ *
+ * Maps are a special case. For maps, the left must be a nested map such as:
+ * { k1: { k11: 1, k12: 2}, k2: { k21: 3, k22: 4 } } and right needs to be a
+ * simple map such as: { k1: 5, k2: 6 }. The operation is then to multiply
+ * every value of every subkey of left by the value of the right key whose
+ * parent key matches. Using the previous examples, the result would be:
+ * { k1: { k11: 5, k12: 10 }, k2: { k21: 18, k22: 24 } } .
+ *
+ * Config:
+ * left
+ * right
+ */
+ elementwiseMultiply(item, config) {
+ if (!(config.left in item) || !(config.right in item)) {
+ return null;
+ }
+ let leftType = this._typeOf(item[config.left]);
+ if (leftType !== this._typeOf(item[config.right])) {
+ return null;
+ }
+ if (leftType === "array") {
+ if (item[config.left].length !== item[config.right].length) {
+ return null;
+ }
+ for (let i = 0; i < item[config.left].length; i++) {
+ item[config.left][i] *= item[config.right][i];
+ }
+ } else if (leftType === "map") {
+ Object.keys(item[config.left]).forEach(outerKey => {
+ let r = 0.0;
+ if (outerKey in item[config.right]) {
+ r = item[config.right][outerKey];
+ }
+ Object.keys(item[config.left][outerKey]).forEach(innerKey => {
+ item[config.left][outerKey][innerKey] *= r;
+ });
+ });
+ } else if (leftType === "number") {
+ item[config.left] *= item[config.right];
+ } else {
+ return null;
+ }
+
+ return item;
+ }
+
+ /**
+ * Vector multiplies (i.e. dot products) two vectors and stores the result in
+ * a third field. Both vectors must either be maps, or arrays of numbers with
+ * the same length.
+ *
+ * Config:
+ * left A field pointing to either a map of strings to numbers,
+ * or an array of numbers
+ * right A field pointing to either a map of strings to numbers,
+ * or an array of numbers
+ * dest The field to store the dot product.
+ */
+ vectorMultiply(item, config) {
+ if (!(config.left in item) || !(config.right in item)) {
+ return null;
+ }
+
+ let leftType = this._typeOf(item[config.left]);
+ if (leftType !== this._typeOf(item[config.right])) {
+ return null;
+ }
+
+ let destVal = 0.0;
+ if (leftType === "array") {
+ if (item[config.left].length !== item[config.right].length) {
+ return null;
+ }
+ for (let i = 0; i < item[config.left].length; i++) {
+ destVal += item[config.left][i] * item[config.right][i];
+ }
+ } else if (leftType === "map") {
+ Object.keys(item[config.left]).forEach(key => {
+ if (key in item[config.right]) {
+ destVal += item[config.left][key] * item[config.right][key];
+ }
+ });
+ } else {
+ return null;
+ }
+
+ item[config.dest] = destVal;
+ return item;
+ }
+
+ /**
+ * Adds a constant value to all elements in the field. Mathematically,
+ * this is the same as taking a 1-vector, scalar multiplying it by k,
+ * and then vector adding it to a field.
+ *
+ * Config:
+ * field A field pointing to either a map of strings to numbers,
+ * or an array of numbers
+ * k Either a number, or a string. If it's a number, then this
+ * is the scalar value to add. If it's a string,
+ * the value in the pointed-to field is used.
+ * dfault OPTIONAL (DEFAULT: 0) If k is a string and no numeric
+ * value is found, then use this value.
+ */
+ scalarAdd(item, config) {
+ let k = this._lookupScalar(item, config.k, config.dfault);
+ if (!(config.field in item)) {
+ return null;
+ }
+
+ let fieldType = this._typeOf(item[config.field]);
+ if (fieldType === "array") {
+ for (let i = 0; i < item[config.field].length; i++) {
+ item[config.field][i] += k;
+ }
+ } else if (fieldType === "map") {
+ Object.keys(item[config.field]).forEach(key => {
+ item[config.field][key] += k;
+ });
+ } else if (fieldType === "number") {
+ item[config.field] += k;
+ } else {
+ return null;
+ }
+
+ return item;
+ }
+
+ /**
+ * Adds two vectors together and stores the result in left.
+ *
+ * Config:
+ * left A field pointing to either a map of strings to numbers,
+ * or an array of numbers
+ * right A field pointing to either a map of strings to numbers,
+ * or an array of numbers
+ */
+ vectorAdd(item, config) {
+ if (!(config.left in item)) {
+ return this.copyValue(item, { src: config.right, dest: config.left });
+ }
+ if (!(config.right in item)) {
+ return null;
+ }
+
+ let leftType = this._typeOf(item[config.left]);
+ if (leftType !== this._typeOf(item[config.right])) {
+ return null;
+ }
+ if (leftType === "array") {
+ if (item[config.left].length !== item[config.right].length) {
+ return null;
+ }
+ for (let i = 0; i < item[config.left].length; i++) {
+ item[config.left][i] += item[config.right][i];
+ }
+ return item;
+ } else if (leftType === "map") {
+ Object.keys(item[config.right]).forEach(key => {
+ let v = 0;
+ if (key in item[config.left]) {
+ v = item[config.left][key];
+ }
+ item[config.left][key] = v + item[config.right][key];
+ });
+ return item;
+ }
+
+ return null;
+ }
+
+ /**
+ * Converts a vector from real values to boolean integers. (i.e. either 1/0
+ * or 1/-1).
+ *
+ * Config:
+ * field Field containing either a map of strings to numbers or
+ * an array of numbers to convert.
+ * threshold OPTIONAL (DEFAULT: 0) Values above this will be replaced
+ * with 1.0. Those below will be converted to 0.
+ * keep_negative OPTIONAL (DEFAULT: False) If true, values below the
+ * threshold will be converted to -1 instead of 0.
+ */
+ makeBoolean(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+ let threshold = this._lookupScalar(item, config.threshold, 0.0);
+ let type = this._typeOf(item[config.field]);
+ if (type === "array") {
+ for (let i = 0; i < item[config.field].length; i++) {
+ if (item[config.field][i] > threshold) {
+ item[config.field][i] = 1.0;
+ } else if (config.keep_negative) {
+ item[config.field][i] = -1.0;
+ } else {
+ item[config.field][i] = 0.0;
+ }
+ }
+ } else if (type === "map") {
+ Object.keys(item[config.field]).forEach(key => {
+ let value = item[config.field][key];
+ if (value > threshold) {
+ item[config.field][key] = 1.0;
+ } else if (config.keep_negative) {
+ item[config.field][key] = -1.0;
+ } else {
+ item[config.field][key] = 0.0;
+ }
+ });
+ } else if (type === "number") {
+ let value = item[config.field];
+ if (value > threshold) {
+ item[config.field] = 1.0;
+ } else if (config.keep_negative) {
+ item[config.field] = -1.0;
+ } else {
+ item[config.field] = 0.0;
+ }
+ } else {
+ return null;
+ }
+
+ return item;
+ }
+
+ /**
+ * Removes all keys from the item except for the ones specified.
+ *
+ * Config:
+ * fields An array of strings indicating the fields to keep
+ */
+ allowFields(item, config) {
+ let newItem = {};
+ for (let ele of config.fields) {
+ if (ele in item) {
+ newItem[ele] = item[ele];
+ }
+ }
+ return newItem;
+ }
+
+ /**
+ * Removes all keys whose value does not exceed some threshold.
+ *
+ * Config:
+ * field Points to a map of strings to numbers
+ * threshold Values must exceed this value, otherwise they are removed.
+ */
+ filterByValue(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+ let threshold = this._lookupScalar(item, config.threshold, 0.0);
+ let filtered = {};
+ Object.keys(item[config.field]).forEach(key => {
+ let value = item[config.field][key];
+ if (value > threshold) {
+ filtered[key] = value;
+ }
+ });
+ item[config.field] = filtered;
+
+ return item;
+ }
+
+ /**
+ * Rewrites a field so that its values are now L2 normed.
+ *
+ * Config:
+ * field Points to a map of strings to numbers, or an array of numbers
+ */
+ l2Normalize(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+ let data = item[config.field];
+ let type = this._typeOf(data);
+ if (type === "array") {
+ let norm = 0.0;
+ for (let datum of data) {
+ norm += datum * datum;
+ }
+ norm = Math.sqrt(norm);
+ if (norm !== 0) {
+ for (let i = 0; i < data.length; i++) {
+ data[i] /= norm;
+ }
+ }
+ } else if (type === "map") {
+ let norm = 0.0;
+ Object.keys(data).forEach(key => {
+ norm += data[key] * data[key];
+ });
+ norm = Math.sqrt(norm);
+ if (norm !== 0) {
+ Object.keys(data).forEach(key => {
+ data[key] /= norm;
+ });
+ }
+ } else {
+ return null;
+ }
+
+ item[config.field] = data;
+
+ return item;
+ }
+
+ /**
+ * Rewrites a field so that all of its values sum to 1.0
+ *
+ * Config:
+ * field Points to a map of strings to numbers, or an array of numbers
+ */
+ probNormalize(item, config) {
+ if (!(config.field in item)) {
+ return null;
+ }
+ let data = item[config.field];
+ let type = this._typeOf(data);
+ if (type === "array") {
+ let norm = 0.0;
+ for (let datum of data) {
+ norm += datum;
+ }
+ if (norm !== 0) {
+ for (let i = 0; i < data.length; i++) {
+ data[i] /= norm;
+ }
+ }
+ } else if (type === "map") {
+ let norm = 0.0;
+ Object.keys(item[config.field]).forEach(key => {
+ norm += item[config.field][key];
+ });
+ if (norm !== 0) {
+ Object.keys(item[config.field]).forEach(key => {
+ item[config.field][key] /= norm;
+ });
+ }
+ } else {
+ return null;
+ }
+
+ return item;
+ }
+
+ /**
+ * Stores a value, if it is not already present
+ *
+ * Config:
+ * field field to write to if it is missing
+ * value value to store in that field
+ */
+ setDefault(item, config) {
+ let val = this._lookupScalar(item, config.value, config.value);
+ if (!(config.field in item)) {
+ item[config.field] = val;
+ }
+
+ return item;
+ }
+
+ /**
+ * Selectively promotes a value from an inner map up to the outer map
+ *
+ * Config:
+ * haystack Points to a map of strings to values
+ * needle Key inside the map we should promote up
+ * dest Where we should write the value of haystack[needle]
+ */
+ lookupValue(item, config) {
+ if (config.haystack in item && config.needle in item[config.haystack]) {
+ item[config.dest] = item[config.haystack][config.needle];
+ }
+
+ return item;
+ }
+
+ /**
+ * Demotes a field into a map
+ *
+ * Config:
+ * src Field to copy
+ * dest_map Points to a map
+ * dest_key Key inside dest_map to copy src to
+ */
+ copyToMap(item, config) {
+ if (config.src in item) {
+ if (!(config.dest_map in item)) {
+ item[config.dest_map] = {};
+ }
+ item[config.dest_map][config.dest_key] = item[config.src];
+ }
+
+ return item;
+ }
+
+ /**
+ * Scalar multiplies the values of a nested tag map, optionally log scaling them first.
+ *
+ * Config:
+ * field Points to a string to number map
+ * k Scalar to multiply the values by
+ * log_scale Boolean, if true, then the values will be transformed
+ * by a logarithm prior to multiplication
+ */
+ scalarMultiplyTag(item, config) {
+ let EPSILON = 0.000001;
+ if (!(config.field in item)) {
+ return null;
+ }
+ let k = this._lookupScalar(item, config.k, 1);
+ let type = this._typeOf(item[config.field]);
+ if (type === "map") {
+ Object.keys(item[config.field]).forEach(parentKey => {
+ Object.keys(item[config.field][parentKey]).forEach(key => {
+ let v = item[config.field][parentKey][key];
+ if (config.log_scale) {
+ v = Math.log(v + EPSILON);
+ }
+ item[config.field][parentKey][key] = v * k;
+ });
+ });
+ } else {
+ return null;
+ }
+
+ return item;
+ }
+
+ /**
+ * Independently applies softmax across all subtags.
+ *
+ * Config:
+ * field Points to a map of strings with values being another map of strings
+ */
+ applySoftmaxTags(item, config) {
+ let type = this._typeOf(item[config.field]);
+ if (type !== "map") {
+ return null;
+ }
+
+ let abort = false;
+ let softmaxSum = {};
+ Object.keys(item[config.field]).forEach(tag => {
+ if (this._typeOf(item[config.field][tag]) !== "map") {
+ abort = true;
+ return;
+ }
+ if (abort) {
+ return;
+ }
+ softmaxSum[tag] = 0;
+ Object.keys(item[config.field][tag]).forEach(subtag => {
+ if (this._typeOf(item[config.field][tag][subtag]) !== "number") {
+ abort = true;
+ return;
+ }
+ let score = item[config.field][tag][subtag];
+ softmaxSum[tag] += Math.exp(score);
+ });
+ });
+ if (abort) {
+ return null;
+ }
+
+ Object.keys(item[config.field]).forEach(tag => {
+ Object.keys(item[config.field][tag]).forEach(subtag => {
+ item[config.field][tag][subtag] =
+ Math.exp(item[config.field][tag][subtag]) / softmaxSum[tag];
+ });
+ });
+
+ return item;
+ }
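+
+ // For illustration (not part of the module): with
+ //   item = { tags: { autos: { sports: 1, sedans: 0 } } }
+ // and config = { field: "tags" },
+ // each subtag becomes exp(score) / sum(exp(scores)) within its parent tag:
+ //   item.tags.autos.sports ≈ 0.73 and item.tags.autos.sedans ≈ 0.27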
+
+ /**
+ * Vector adds a field and stores the result in left.
+ *
+ * Config:
+ * field The field to vector add
+ */
+ combinerAdd(left, right, config) {
+ if (!(config.field in right)) {
+ return left;
+ }
+ let type = this._typeOf(right[config.field]);
+ if (!(config.field in left)) {
+ if (type === "map") {
+ left[config.field] = {};
+ } else if (type === "array") {
+ left[config.field] = [];
+ } else if (type === "number") {
+ left[config.field] = 0;
+ } else {
+ return null;
+ }
+ }
+ if (type !== this._typeOf(left[config.field])) {
+ return null;
+ }
+ if (type === "map") {
+ Object.keys(right[config.field]).forEach(key => {
+ if (!(key in left[config.field])) {
+ left[config.field][key] = 0;
+ }
+ left[config.field][key] += right[config.field][key];
+ });
+ } else if (type === "array") {
+ for (let i = 0; i < right[config.field].length; i++) {
+ if (i < left[config.field].length) {
+ left[config.field][i] += right[config.field][i];
+ } else {
+ left[config.field].push(right[config.field][i]);
+ }
+ }
+ } else if (type === "number") {
+ left[config.field] += right[config.field];
+ } else {
+ return null;
+ }
+
+ return left;
+ }
+
+ /**
+ * Stores the maximum value of the field in left.
+ *
+ * Config:
+ * field The field to take the elementwise maximum of
+ */
+ combinerMax(left, right, config) {
+ if (!(config.field in right)) {
+ return left;
+ }
+ let type = this._typeOf(right[config.field]);
+ if (!(config.field in left)) {
+ if (type === "map") {
+ left[config.field] = {};
+ } else if (type === "array") {
+ left[config.field] = [];
+ } else if (type === "number") {
+ left[config.field] = 0;
+ } else {
+ return null;
+ }
+ }
+ if (type !== this._typeOf(left[config.field])) {
+ return null;
+ }
+ if (type === "map") {
+ Object.keys(right[config.field]).forEach(key => {
+ if (
+ !(key in left[config.field]) ||
+ right[config.field][key] > left[config.field][key]
+ ) {
+ left[config.field][key] = right[config.field][key];
+ }
+ });
+ } else if (type === "array") {
+ for (let i = 0; i < right[config.field].length; i++) {
+ if (i < left[config.field].length) {
+ if (left[config.field][i] < right[config.field][i]) {
+ left[config.field][i] = right[config.field][i];
+ }
+ } else {
+ left[config.field].push(right[config.field][i]);
+ }
+ }
+ } else if (type === "number") {
+ if (left[config.field] < right[config.field]) {
+ left[config.field] = right[config.field];
+ }
+ } else {
+ return null;
+ }
+
+ return left;
+ }
+
+ /**
+ * Associates a value in right with another value in right. This association
+ * is then stored in a map in left.
+ *
+ * For example: If a sequence of rights is:
+ * { 'tags': {}, 'url_domain': 'maseratiusa.com/maserati', 'time': 41 }
+ * { 'tags': {}, 'url_domain': 'mbusa.com/mercedes', 'time': 21 }
+ * { 'tags': {}, 'url_domain': 'maseratiusa.com/maserati', 'time': 34 }
+ *
+ * Then assuming a 'sum' operation, left can build a map that would look like:
+ * {
+ * 'maseratiusa.com/maserati': 75,
+ * 'mbusa.com/mercedes': 21,
+ * }
+ *
+ * Fields:
+ * left_field field in the left to store / update the map
+ * right_key_field Field in the right to use as a key
+ * right_value_field Field in the right to use as a value
+ * operation One of "sum", "max", "overwrite", "count"
+ */
+ combinerCollectValues(left, right, config) {
+ let op;
+ if (config.operation === "sum") {
+ op = (a, b) => a + b;
+ } else if (config.operation === "max") {
+ op = (a, b) => (a > b ? a : b);
+ } else if (config.operation === "overwrite") {
+ op = (a, b) => b;
+ } else if (config.operation === "count") {
+ op = (a, b) => a + 1;
+ } else {
+ return null;
+ }
+ if (!(config.left_field in left)) {
+ left[config.left_field] = {};
+ }
+ if (
+ !(config.right_key_field in right) ||
+ !(config.right_value_field in right)
+ ) {
+ return left;
+ }
+
+ let key = right[config.right_key_field];
+ let rightValue = right[config.right_value_field];
+ let leftValue = 0.0;
+ if (key in left[config.left_field]) {
+ leftValue = left[config.left_field][key];
+ }
+
+ left[config.left_field][key] = op(leftValue, rightValue);
+
+ return left;
+ }
+
+ /**
+ * Executes a recipe. Returns an object on success, or null on failure.
+ */
+ executeRecipe(item, recipe) {
+ let newItem = item;
+ if (recipe) {
+ for (let step of recipe) {
+ let op = this.ITEM_BUILDER_REGISTRY[step.function];
+ if (op === undefined) {
+ return null;
+ }
+ newItem = op.call(this, newItem, step);
+ if (newItem === null) {
+ break;
+ }
+ }
+ }
+ return newItem;
+ }
+
+ /**
+ * Executes a combiner recipe. Returns an object on success, or null on failure.
+ */
+ executeCombinerRecipe(item1, item2, recipe) {
+ let newItem1 = item1;
+ for (let step of recipe) {
+ let op = this.ITEM_COMBINER_REGISTRY[step.function];
+ if (op === undefined) {
+ return null;
+ }
+ newItem1 = op.call(this, newItem1, item2, step);
+ if (newItem1 === null) {
+ break;
+ }
+ }
+
+ return newItem1;
+ }
+};
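+
+// Illustrative usage, not part of the module: a sketch of running a builder recipe,
+// assuming an executor built with Naive Bayes / NMF taggers and the tokenize helper
+// from Tokenize.jsm. The step functions and config keys are those registered in
+// ITEM_BUILDER_REGISTRY above; the URL is a placeholder.
+//
+//   const executor = new RecipeExecutor(nbTaggers, nmfTaggers, tokenize);
+//   const item = executor.executeRecipe(
+//     { url: "https://www.example.com/autos/sports" },
+//     [
+//       { function: "get_url_domain", path_length: 1, field: "url", dest: "url_domain" },
+//       { function: "tokenize_url", field: "url", dest: "url_toks" },
+//     ]
+//   );
+//   // item.url_domain === "example.com/autos"; item.url_toks is an array of string tokens.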
diff --git a/browser/components/newtab/lib/PersonalityProvider/Tokenize.jsm b/browser/components/newtab/lib/PersonalityProvider/Tokenize.jsm
new file mode 100644
index 0000000000..94835557a6
--- /dev/null
+++ b/browser/components/newtab/lib/PersonalityProvider/Tokenize.jsm
@@ -0,0 +1,89 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+"use strict";
+
+// We load this into a worker using importScripts, and in tests using import.
+// We use var to avoid name collision errors.
+// eslint-disable-next-line no-var
+var EXPORTED_SYMBOLS = ["tokenize", "toksToTfIdfVector"];
+
+// Unicode specifies certain mnemonics for code pages and character classes.
+// They call them "character properties" https://en.wikipedia.org/wiki/Unicode_character_property .
+// These mnemonics have been adopted by many regular expression libraries;
+// however, the standard JavaScript regexp system doesn't support Unicode
+// character properties, so we have to define these ourselves.
+//
+// Each of these sections contains the character values / ranges for a specific
+// character property: Whitespace, Symbol (S), Punctuation (P), Number (N),
+// Mark (M), and Letter (L).
+const UNICODE_SPACE =
+ "\x20\xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000";
+const UNICODE_SYMBOL =
+ "\\x24\\x2B\x3C-\x3E\\x5E\x60\\x7C\x7E\xA2-\xA6\xA8\xA9\xAC\xAE-\xB1\xB4\xB8\xD7\xF7\u02C2-\u02C5\u02D2-\u02DF\u02E5-\u02EB\u02ED\u02EF-\u02FF\u0375\u0384\u0385\u03F6\u0482\u058D-\u058F\u0606-\u0608\u060B\u060E\u060F\u06DE\u06E9\u06FD\u06FE\u07F6\u09F2\u09F3\u09FA\u09FB\u0AF1\u0B70\u0BF3-\u0BFA\u0C7F\u0D4F\u0D79\u0E3F\u0F01-\u0F03\u0F13\u0F15-\u0F17\u0F1A-\u0F1F\u0F34\u0F36\u0F38\u0FBE-\u0FC5\u0FC7-\u0FCC\u0FCE\u0FCF\u0FD5-\u0FD8\u109E\u109F\u1390-\u1399\u17DB\u1940\u19DE-\u19FF\u1B61-\u1B6A\u1B74-\u1B7C\u1FBD\u1FBF-\u1FC1\u1FCD-\u1FCF\u1FDD-\u1FDF\u1FED-\u1FEF\u1FFD\u1FFE\u2044\u2052\u207A-\u207C\u208A-\u208C\u20A0-\u20BE\u2100\u2101\u2103-\u2106\u2108\u2109\u2114\u2116-\u2118\u211E-\u2123\u2125\u2127\u2129\u212E\u213A\u213B\u2140-\u2144\u214A-\u214D\u214F\u218A\u218B\u2190-\u2307\u230C-\u2328\u232B-\u23FE\u2400-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u2767\u2794-\u27C4\u27C7-\u27E5\u27F0-\u2982\u2999-\u29D7\u29DC-\u29FB\u29FE-\u2B73\u2B76-\u2B95\u2B98-\u2BB9\u2BBD-\u2BC8\u2BCA-\u2BD1\u2BEC-\u2BEF\u2CE5-\u2CEA\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB\u3004\u3012\u3013\u3020\u3036\u3037\u303E\u303F\u309B\u309C\u3190\u3191\u3196-\u319F\u31C0-\u31E3\u3200-\u321E\u322A-\u3247\u3250\u3260-\u327F\u328A-\u32B0\u32C0-\u32FE\u3300-\u33FF\u4DC0-\u4DFF\uA490-\uA4C6\uA700-\uA716\uA720\uA721\uA789\uA78A\uA828-\uA82B\uA836-\uA839\uAA77-\uAA79\uAB5B\uFB29\uFBB2-\uFBC1\uFDFC\uFDFD\uFE62\uFE64-\uFE66\uFE69\uFF04\uFF0B\uFF1C-\uFF1E\uFF3E\uFF40\uFF5C\uFF5E\uFFE0-\uFFE6\uFFE8-\uFFEE\uFFFC\uFFFD";
+const UNICODE_PUNCT =
+ "\x21-\x23\x25-\\x2A\x2C-\x2F\x3A\x3B\\x3F\x40\\x5B-\\x5D\x5F\\x7B\x7D\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u0AF0\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E44\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65";
+
+const UNICODE_NUMBER =
+ "0-9\xB2\xB3\xB9\xBC-\xBE\u0660-\u0669\u06F0-\u06F9\u07C0-\u07C9\u0966-\u096F\u09E6-\u09EF\u09F4-\u09F9\u0A66-\u0A6F\u0AE6-\u0AEF\u0B66-\u0B6F\u0B72-\u0B77\u0BE6-\u0BF2\u0C66-\u0C6F\u0C78-\u0C7E\u0CE6-\u0CEF\u0D58-\u0D5E\u0D66-\u0D78\u0DE6-\u0DEF\u0E50-\u0E59\u0ED0-\u0ED9\u0F20-\u0F33\u1040-\u1049\u1090-\u1099\u1369-\u137C\u16EE-\u16F0\u17E0-\u17E9\u17F0-\u17F9\u1810-\u1819\u1946-\u194F\u19D0-\u19DA\u1A80-\u1A89\u1A90-\u1A99\u1B50-\u1B59\u1BB0-\u1BB9\u1C40-\u1C49\u1C50-\u1C59\u2070\u2074-\u2079\u2080-\u2089\u2150-\u2182\u2185-\u2189\u2460-\u249B\u24EA-\u24FF\u2776-\u2793\u2CFD\u3007\u3021-\u3029\u3038-\u303A\u3192-\u3195\u3220-\u3229\u3248-\u324F\u3251-\u325F\u3280-\u3289\u32B1-\u32BF\uA620-\uA629\uA6E6-\uA6EF\uA830-\uA835\uA8D0-\uA8D9\uA900-\uA909\uA9D0-\uA9D9\uA9F0-\uA9F9\uAA50-\uAA59\uABF0-\uABF9\uFF10-\uFF19";
+const UNICODE_MARK =
+ "\u0300-\u036F\u0483-\u0489\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED\u0711\u0730-\u074A\u07A6-\u07B0\u07EB-\u07F3\u0816-\u0819\u081B-\u0823\u0825-\u0827\u0829-\u082D\u0859-\u085B\u08D4-\u08E1\u08E3-\u0903\u093A-\u093C\u093E-\u094F\u0951-\u0957\u0962\u0963\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3\u0A01-\u0A03\u0A3C\u0A3E-\u0A42\u0A47\u0A48\u0A4B-\u0A4D\u0A51\u0A70\u0A71\u0A75\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD\u0AE2\u0AE3\u0B01-\u0B03\u0B3C\u0B3E-\u0B44\u0B47\u0B48\u0B4B-\u0B4D\u0B56\u0B57\u0B62\u0B63\u0B82\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7\u0C00-\u0C03\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55\u0C56\u0C62\u0C63\u0C81-\u0C83\u0CBC\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5\u0CD6\u0CE2\u0CE3\u0D01-\u0D03\u0D3E-\u0D44\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0D62\u0D63\u0D82\u0D83\u0DCA\u0DCF-\u0DD4\u0DD6\u0DD8-\u0DDF\u0DF2\u0DF3\u0E31\u0E34-\u0E3A\u0E47-\u0E4E\u0EB1\u0EB4-\u0EB9\u0EBB\u0EBC\u0EC8-\u0ECD\u0F18\u0F19\u0F35\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86\u0F87\u0F8D-\u0F97\u0F99-\u0FBC\u0FC6\u102B-\u103E\u1056-\u1059\u105E-\u1060\u1062-\u1064\u1067-\u106D\u1071-\u1074\u1082-\u108D\u108F\u109A-\u109D\u135D-\u135F\u1712-\u1714\u1732-\u1734\u1752\u1753\u1772\u1773\u17B4-\u17D3\u17DD\u180B-\u180D\u1885\u1886\u18A9\u1920-\u192B\u1930-\u193B\u1A17-\u1A1B\u1A55-\u1A5E\u1A60-\u1A7C\u1A7F\u1AB0-\u1ABE\u1B00-\u1B04\u1B34-\u1B44\u1B6B-\u1B73\u1B80-\u1B82\u1BA1-\u1BAD\u1BE6-\u1BF3\u1C24-\u1C37\u1CD0-\u1CD2\u1CD4-\u1CE8\u1CED\u1CF2-\u1CF4\u1CF8\u1CF9\u1DC0-\u1DF5\u1DFB-\u1DFF\u20D0-\u20F0\u2CEF-\u2CF1\u2D7F\u2DE0-\u2DFF\u302A-\u302F\u3099\u309A\uA66F-\uA672\uA674-\uA67D\uA69E\uA69F\uA6F0\uA6F1\uA802\uA806\uA80B\uA823-\uA827\uA880\uA881\uA8B4-\uA8C5\uA8E0-\uA8F1\uA926-\uA92D\uA947-\uA953\uA980-\uA983\uA9B3-\uA9C0\uA9E5\uAA29-\uAA36\uAA43\uAA4C\uAA4D\uAA7B-\uAA7D\uAAB0\uAAB2-\uAAB4\uAAB7\uAAB8\uAABE\uAABF\uAAC1\uAAEB-\uAAEF\uAAF5\uAAF6\uABE3-\uABEA\uABEC\uABED\uFB1E\uFE00-\uFE0F\uFE20-\uFE2F";
+const UNICODE_LETTER =
+ "A-Za-z\xAA\xB5\xBA\xC0-\xD6\xD8-\xF6\xF8-\u02C1\u02C6-\u02D1\u02E0-\u02E4\u02EC\u02EE\u0370-\u0374\u0376\u0377\u037A-\u037D\u037F\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03F5\u03F7-\u0481\u048A-\u052F\u0531-\u0556\u0559\u0561-\u0587\u05D0-\u05EA\u05F0-\u05F2\u0620-\u064A\u066E\u066F\u0671-\u06D3\u06D5\u06E5\u06E6\u06EE\u06EF\u06FA-\u06FC\u06FF\u0710\u0712-\u072F\u074D-\u07A5\u07B1\u07CA-\u07EA\u07F4\u07F5\u07FA\u0800-\u0815\u081A\u0824\u0828\u0840-\u0858\u08A0-\u08B4\u08B6-\u08BD\u0904-\u0939\u093D\u0950\u0958-\u0961\u0971-\u0980\u0985-\u098C\u098F\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09BD\u09CE\u09DC\u09DD\u09DF-\u09E1\u09F0\u09F1\u0A05-\u0A0A\u0A0F\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32\u0A33\u0A35\u0A36\u0A38\u0A39\u0A59-\u0A5C\u0A5E\u0A72-\u0A74\u0A85-\u0A8D\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2\u0AB3\u0AB5-\u0AB9\u0ABD\u0AD0\u0AE0\u0AE1\u0AF9\u0B05-\u0B0C\u0B0F\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32\u0B33\u0B35-\u0B39\u0B3D\u0B5C\u0B5D\u0B5F-\u0B61\u0B71\u0B83\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99\u0B9A\u0B9C\u0B9E\u0B9F\u0BA3\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB9\u0BD0\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C39\u0C3D\u0C58-\u0C5A\u0C60\u0C61\u0C80\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBD\u0CDE\u0CE0\u0CE1\u0CF1\u0CF2\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D3A\u0D3D\u0D4E\u0D54-\u0D56\u0D5F-\u0D61\u0D7A-\u0D7F\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6\u0E01-\u0E30\u0E32\u0E33\u0E40-\u0E46\u0E81\u0E82\u0E84\u0E87\u0E88\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F\u0EA1-\u0EA3\u0EA5\u0EA7\u0EAA\u0EAB\u0EAD-\u0EB0\u0EB2\u0EB3\u0EBD\u0EC0-\u0EC4\u0EC6\u0EDC-\u0EDF\u0F00\u0F40-\u0F47\u0F49-\u0F6C\u0F88-\u0F8C\u1000-\u102A\u103F\u1050-\u1055\u105A-\u105D\u1061\u1065\u1066\u106E-\u1070\u1075-\u1081\u108E\u10A0-\u10C5\u10C7\u10CD\u10D0-\u10FA\u10FC-\u1248\u124A-\u124D\u1250-\u1256\u1258\u125A-\u125D\u1260-\u1288\u128A-\u128D\u1290-\u12B0\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2-\u12C5\u12C8-\u12D6\u12D8-\u1310\u1312-\u1315\u1318-\u135A\u1380-\u138F\u13A0-\u13F5\u13F8-\u13FD\u1401-\u166C\u166F-\u167F\u1681-\u169A\u16A0-\u16EA\u16F1-\u16F8\u1700-\u170C\u170E-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176C\u176E-\u1770\u1780-\u17B3\u17D7\u17DC\u1820-\u1877\u1880-\u1884\u1887-\u18A8\u18AA\u18B0-\u18F5\u1900-\u191E\u1950-\u196D\u1970-\u1974\u1980-\u19AB\u19B0-\u19C9\u1A00-\u1A16\u1A20-\u1A54\u1AA7\u1B05-\u1B33\u1B45-\u1B4B\u1B83-\u1BA0\u1BAE\u1BAF\u1BBA-\u1BE5\u1C00-\u1C23\u1C4D-\u1C4F\u1C5A-\u1C7D\u1C80-\u1C88\u1CE9-\u1CEC\u1CEE-\u1CF1\u1CF5\u1CF6\u1D00-\u1DBF\u1E00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2071\u207F\u2090-\u209C\u2102\u2107\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u212F-\u2139\u213C-\u213F\u2145-\u2149\u214E\u2183\u2184\u2C00-\u2C2E\u2C30-\u2C5E\u2C60-\u2CE4\u2CEB-\u2CEE\u2CF2\u2CF3\u2D00-\u2D25\u2D27\u2D2D\u2D30-\u2D67\u2D6F\u2D80-\u2D96\u2DA0-\u2DA6\u2DA8-\u2DAE\u2DB0-\u2DB6\u2DB8-\u2DBE\u2DC0-\u2DC6\u2DC8-\u2DCE\u2DD0-\u2DD6\u2DD8-\u2DDE\u2E2F\u3005\u3006\u3031-\u3035\u303B\u303C\u3041-\u3096\u309D-\u309F\u30A1-\u30FA\u30FC-\u30FF\u3105-\u312D\u3131-\u318E\u31A0-\u31BA\u31F0-\u31FF\u3400-\u4DB5\u4E00-\u9FD5\uA000-\uA48C\uA4D0-\uA4FD\uA500-\uA60C\uA610-\uA61F\uA62A\uA62B\uA640-\uA66E\uA67F-\uA69D\uA6A0-\uA6E5\uA717-\uA71F\uA722-\uA788\uA78B-\uA7AE\uA7B0-\uA7B7\uA7F7-\uA801\uA803-\uA805\uA807-\uA80A\uA80C-\uA822\uA840-\
uA873\uA882-\uA8B3\uA8F2-\uA8F7\uA8FB\uA8FD\uA90A-\uA925\uA930-\uA946\uA960-\uA97C\uA984-\uA9B2\uA9CF\uA9E0-\uA9E4\uA9E6-\uA9EF\uA9FA-\uA9FE\uAA00-\uAA28\uAA40-\uAA42\uAA44-\uAA4B\uAA60-\uAA76\uAA7A\uAA7E-\uAAAF\uAAB1\uAAB5\uAAB6\uAAB9-\uAABD\uAAC0\uAAC2\uAADB-\uAADD\uAAE0-\uAAEA\uAAF2-\uAAF4\uAB01-\uAB06\uAB09-\uAB0E\uAB11-\uAB16\uAB20-\uAB26\uAB28-\uAB2E\uAB30-\uAB5A\uAB5C-\uAB65\uAB70-\uABE2\uAC00-\uD7A3\uD7B0-\uD7C6\uD7CB-\uD7FB\uF900-\uFA6D\uFA70-\uFAD9\uFB00-\uFB06\uFB13-\uFB17\uFB1D\uFB1F-\uFB28\uFB2A-\uFB36\uFB38-\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46-\uFBB1\uFBD3-\uFD3D\uFD50-\uFD8F\uFD92-\uFDC7\uFDF0-\uFDFB\uFE70-\uFE74\uFE76-\uFEFC\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC";
+
+const REGEXP_SPLITS = new RegExp(
+ `[${UNICODE_SPACE}${UNICODE_SYMBOL}${UNICODE_PUNCT}]+`
+);
+// Match all token characters, so it is okay for this regex to span multiple
+// code points (e.g. letters followed by combining marks).
+// eslint-disable-next-line no-misleading-character-class
+const REGEXP_ALPHANUMS = new RegExp(
+ `^[${UNICODE_NUMBER}${UNICODE_MARK}${UNICODE_LETTER}]+$`
+);
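For comparison only: the module builds these classes by hand because, as the comment above notes, JavaScript regular expressions historically lacked Unicode property escapes. A rough modern equivalent is sketched below as an assumption about what the manual ranges approximate; it is not what this module ships.

    // Rough equivalents using ES2018+ Unicode property escapes, shown only
    // to illustrate what the hand-built character classes above approximate.
    const SPLITS_ALT = /[\p{Z}\p{S}\p{P}]+/u;
    const ALPHANUMS_ALT = /^[\p{N}\p{M}\p{L}]+$/u;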
+
+/**
+ * Downcases the text, and splits it into runs of consecutive alphanumeric
+ * characters. This is locale aware, and so will not strip accents. This uses
+ * "word breaks", and so is not appropriate for languages without them
+ * (e.g. Chinese).
+ */
+function tokenize(text) {
+ return text
+ .toLocaleLowerCase()
+ .split(REGEXP_SPLITS)
+ .filter(tok => tok.match(REGEXP_ALPHANUMS));
+}
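A quick illustration of what the function above produces, following from the character classes defined earlier (spaces, symbols, and punctuation are split points, and empty fragments are filtered out); the sample string is arbitrary:

    tokenize("Firefox's New Tab: Personalization!");
    // expected: ["firefox", "s", "new", "tab", "personalization"]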
+
+/**
+ * Converts a sequence of tokens into an L2-normed TF-IDF vector. Any terms
+ * that are not preindexed (i.e. do not have a computed inverse document
+ * frequency) will be dropped.
+ */
+function toksToTfIdfVector(tokens, vocab_idfs) {
+ let tfidfs = {};
+
+  // calculate the term frequencies
+ for (let tok of tokens) {
+ if (!(tok in vocab_idfs)) {
+ continue;
+ }
+ if (!(tok in tfidfs)) {
+ tfidfs[tok] = [vocab_idfs[tok][0], 1];
+ } else {
+ tfidfs[tok][1]++;
+ }
+ }
+
+ // now multiply by the log inverse document frequencies, then take
+ // the L2 norm of this.
+ let l2Norm = 0.0;
+ Object.keys(tfidfs).forEach(tok => {
+ tfidfs[tok][1] *= vocab_idfs[tok][1];
+ l2Norm += tfidfs[tok][1] * tfidfs[tok][1];
+ });
+ l2Norm = Math.sqrt(l2Norm);
+ Object.keys(tfidfs).forEach(tok => {
+ tfidfs[tok][1] /= l2Norm;
+ });
+
+ return tfidfs;
+}
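A minimal worked example of the shapes involved, inferred from the code above: vocab_idfs maps each known token to a [termId, idf] pair, and the result maps each kept token to [termId, l2NormalizedTfIdf]. The idf values below are made up for illustration.

    const vocabIdfs = {
      firefox: [0, 1.2], // [termId, idf]; illustrative numbers only
      browser: [1, 0.8],
    };
    const vec = toksToTfIdfVector(
      ["firefox", "firefox", "browser", "unknown"], // "unknown" is dropped
      vocabIdfs
    );
    // term frequencies: firefox = 2, browser = 1
    // raw tf-idf:       firefox = 2 * 1.2 = 2.4, browser = 1 * 0.8 = 0.8
    // L2 norm = sqrt(2.4^2 + 0.8^2) ≈ 2.53
    // vec ≈ { firefox: [0, 0.95], browser: [1, 0.32] }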