diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch) | |
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /browser/components/newtab/lib/PersonalityProvider | |
parent | Initial commit. (diff) | |
download | firefox-esr-upstream.tar.xz firefox-esr-upstream.zip |
Adding upstream version 115.7.0esr.upstream/115.7.0esrupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'browser/components/newtab/lib/PersonalityProvider')
7 files changed, 1984 insertions, 0 deletions
diff --git a/browser/components/newtab/lib/PersonalityProvider/NaiveBayesTextTagger.jsm b/browser/components/newtab/lib/PersonalityProvider/NaiveBayesTextTagger.jsm new file mode 100644 index 0000000000..cc625076ba --- /dev/null +++ b/browser/components/newtab/lib/PersonalityProvider/NaiveBayesTextTagger.jsm @@ -0,0 +1,67 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +"use strict"; + +// We load this into a worker using importScripts, and in tests using import. +// We use var to avoid name collision errors. +// eslint-disable-next-line no-var +var EXPORTED_SYMBOLS = ["NaiveBayesTextTagger"]; + +const NaiveBayesTextTagger = class NaiveBayesTextTagger { + constructor(model, toksToTfIdfVector) { + this.model = model; + this.toksToTfIdfVector = toksToTfIdfVector; + } + + /** + * Determines if the tokenized text belongs to class according to binary naive Bayes + * classifier. Returns an object containing the class label ("label"), and + * the log probability ("logProb") that the text belongs to that class. If + * the positive class is more likely, then "label" is the positive class + * label. If the negative class is matched, then "label" is set to null. + */ + tagTokens(tokens) { + let fv = this.toksToTfIdfVector(tokens, this.model.vocab_idfs); + + let bestLogProb = null; + let bestClassId = -1; + let bestClassLabel = null; + let logSumExp = 0.0; // will be P(x). Used to create a proper probability + for (let classId = 0; classId < this.model.classes.length; classId++) { + let classModel = this.model.classes[classId]; + let classLogProb = classModel.log_prior; + + // dot fv with the class model + for (let pair of Object.values(fv)) { + let [termId, tfidf] = pair; + classLogProb += tfidf * classModel.feature_log_probs[termId]; + } + + if (bestLogProb === null || classLogProb > bestLogProb) { + bestLogProb = classLogProb; + bestClassId = classId; + } + logSumExp += Math.exp(classLogProb); + } + + // now normalize the probability by dividing by P(x) + logSumExp = Math.log(logSumExp); + bestLogProb -= logSumExp; + if (bestClassId === this.model.positive_class_id) { + bestClassLabel = this.model.positive_class_label; + } else { + bestClassLabel = null; + } + + let confident = + bestClassId === this.model.positive_class_id && + bestLogProb > this.model.positive_class_threshold_log_prob; + return { + label: bestClassLabel, + logProb: bestLogProb, + confident, + }; + } +}; diff --git a/browser/components/newtab/lib/PersonalityProvider/NmfTextTagger.jsm b/browser/components/newtab/lib/PersonalityProvider/NmfTextTagger.jsm new file mode 100644 index 0000000000..639c92b6e4 --- /dev/null +++ b/browser/components/newtab/lib/PersonalityProvider/NmfTextTagger.jsm @@ -0,0 +1,65 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +"use strict"; + +// We load this into a worker using importScripts, and in tests using import. +// We use var to avoid name collision errors. +// eslint-disable-next-line no-var +var EXPORTED_SYMBOLS = ["NmfTextTagger"]; + +const NmfTextTagger = class NmfTextTagger { + constructor(model, toksToTfIdfVector) { + this.model = model; + this.toksToTfIdfVector = toksToTfIdfVector; + } + + /** + * A multiclass classifier that scores tokenized text against several classes through + * inference of a nonnegative matrix factorization of TF-IDF vectors and + * class labels. Returns a map of class labels as string keys to scores. + * (Higher is more confident.) All classes get scored, so it is up to + * consumer of this data determine what classes are most valuable. + */ + tagTokens(tokens) { + let fv = this.toksToTfIdfVector(tokens, this.model.vocab_idfs); + let fve = Object.values(fv); + + // normalize by the sum of the vector + let sum = 0.0; + for (let pair of fve) { + // eslint-disable-next-line prefer-destructuring + sum += pair[1]; + } + for (let i = 0; i < fve.length; i++) { + // eslint-disable-next-line prefer-destructuring + fve[i][1] /= sum; + } + + // dot the document with each topic vector so that we can transform it into + // the latent space + let toksInLatentSpace = []; + for (let topicVect of this.model.topic_word) { + let fvDotTwv = 0; + // dot fv with each topic word vector + for (let pair of fve) { + let [termId, tfidf] = pair; + fvDotTwv += tfidf * topicVect[termId]; + } + toksInLatentSpace.push(fvDotTwv); + } + + // now project toksInLatentSpace back into class space + let predictions = {}; + Object.keys(this.model.document_topic).forEach(topic => { + let score = 0; + for (let i = 0; i < toksInLatentSpace.length; i++) { + score += toksInLatentSpace[i] * this.model.document_topic[topic][i]; + } + predictions[topic] = score; + }); + + return predictions; + } +}; diff --git a/browser/components/newtab/lib/PersonalityProvider/PersonalityProvider.jsm b/browser/components/newtab/lib/PersonalityProvider/PersonalityProvider.jsm new file mode 100644 index 0000000000..c1f54408f2 --- /dev/null +++ b/browser/components/newtab/lib/PersonalityProvider/PersonalityProvider.jsm @@ -0,0 +1,282 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +"use strict"; + +const lazy = {}; + +ChromeUtils.defineESModuleGetters(lazy, { + NewTabUtils: "resource://gre/modules/NewTabUtils.sys.mjs", + RemoteSettings: "resource://services-settings/remote-settings.sys.mjs", + Utils: "resource://services-settings/Utils.sys.mjs", +}); + +const { BasePromiseWorker } = ChromeUtils.importESModule( + "resource://gre/modules/PromiseWorker.sys.mjs" +); + +const RECIPE_NAME = "personality-provider-recipe"; +const MODELS_NAME = "personality-provider-models"; + +class PersonalityProvider { + constructor(modelKeys) { + this.modelKeys = modelKeys; + this.onSync = this.onSync.bind(this); + this.setup(); + } + + setScores(scores) { + this.scores = scores || {}; + this.interestConfig = this.scores.interestConfig; + this.interestVector = this.scores.interestVector; + } + + get personalityProviderWorker() { + if (this._personalityProviderWorker) { + return this._personalityProviderWorker; + } + + this._personalityProviderWorker = new BasePromiseWorker( + "resource://activity-stream/lib/PersonalityProvider/PersonalityProviderWorker.js" + ); + + return this._personalityProviderWorker; + } + + get baseAttachmentsURL() { + // Returning a promise, so we can have an async getter. + return this._getBaseAttachmentsURL(); + } + + async _getBaseAttachmentsURL() { + if (this._baseAttachmentsURL) { + return this._baseAttachmentsURL; + } + const server = lazy.Utils.SERVER_URL; + const serverInfo = await ( + await fetch(`${server}/`, { + credentials: "omit", + }) + ).json(); + const { + capabilities: { + attachments: { base_url }, + }, + } = serverInfo; + this._baseAttachmentsURL = base_url; + return this._baseAttachmentsURL; + } + + setup() { + this.setupSyncAttachment(RECIPE_NAME); + this.setupSyncAttachment(MODELS_NAME); + } + + teardown() { + this.teardownSyncAttachment(RECIPE_NAME); + this.teardownSyncAttachment(MODELS_NAME); + if (this._personalityProviderWorker) { + this._personalityProviderWorker.terminate(); + } + } + + setupSyncAttachment(collection) { + lazy.RemoteSettings(collection).on("sync", this.onSync); + } + + teardownSyncAttachment(collection) { + lazy.RemoteSettings(collection).off("sync", this.onSync); + } + + onSync(event) { + this.personalityProviderWorker.post("onSync", [event]); + } + + /** + * Gets contents of the attachment if it already exists on file, + * and if not attempts to download it. + */ + getAttachment(record) { + return this.personalityProviderWorker.post("getAttachment", [record]); + } + + /** + * Returns a Recipe from remote settings to be consumed by a RecipeExecutor. + * A Recipe is a set of instructions on how to processes a RecipeExecutor. + */ + async getRecipe() { + if (!this.recipes || !this.recipes.length) { + const result = await lazy.RemoteSettings(RECIPE_NAME).get(); + this.recipes = await Promise.all( + result.map(async record => ({ + ...(await this.getAttachment(record)), + recordKey: record.key, + })) + ); + } + return this.recipes[0]; + } + + /** + * Grabs a slice of browse history for building a interest vector + */ + async fetchHistory(columns, beginTimeSecs, endTimeSecs) { + let sql = `SELECT url, title, visit_count, frecency, last_visit_date, description + FROM moz_places + WHERE last_visit_date >= ${beginTimeSecs * 1000000} + AND last_visit_date < ${endTimeSecs * 1000000}`; + columns.forEach(requiredColumn => { + sql += ` AND IFNULL(${requiredColumn}, '') <> ''`; + }); + sql += " LIMIT 30000"; + + const { activityStreamProvider } = lazy.NewTabUtils; + const history = await activityStreamProvider.executePlacesQuery(sql, { + columns, + params: {}, + }); + + return history; + } + + /** + * Handles setup and metrics of history fetch. + */ + async getHistory() { + let endTimeSecs = new Date().getTime() / 1000; + let beginTimeSecs = endTimeSecs - this.interestConfig.history_limit_secs; + if ( + !this.interestConfig || + !this.interestConfig.history_required_fields || + !this.interestConfig.history_required_fields.length + ) { + return []; + } + let history = await this.fetchHistory( + this.interestConfig.history_required_fields, + beginTimeSecs, + endTimeSecs + ); + + return history; + } + + async setBaseAttachmentsURL() { + await this.personalityProviderWorker.post("setBaseAttachmentsURL", [ + await this.baseAttachmentsURL, + ]); + } + + async setInterestConfig() { + this.interestConfig = this.interestConfig || (await this.getRecipe()); + await this.personalityProviderWorker.post("setInterestConfig", [ + this.interestConfig, + ]); + } + + async setInterestVector() { + await this.personalityProviderWorker.post("setInterestVector", [ + this.interestVector, + ]); + } + + async fetchModels() { + const models = await lazy.RemoteSettings(MODELS_NAME).get(); + return this.personalityProviderWorker.post("fetchModels", [models]); + } + + async generateTaggers() { + await this.personalityProviderWorker.post("generateTaggers", [ + this.modelKeys, + ]); + } + + async generateRecipeExecutor() { + await this.personalityProviderWorker.post("generateRecipeExecutor"); + } + + async createInterestVector() { + const history = await this.getHistory(); + + const interestVectorResult = await this.personalityProviderWorker.post( + "createInterestVector", + [history] + ); + + return interestVectorResult; + } + + async init(callback) { + await this.setBaseAttachmentsURL(); + await this.setInterestConfig(); + if (!this.interestConfig) { + return; + } + + // We always generate a recipe executor, no cache used here. + // This is because the result of this is an object with + // functions (taggers) so storing it in cache is not possible. + // Thus we cannot use it to rehydrate anything. + const fetchModelsResult = await this.fetchModels(); + // If this fails, log an error and return. + if (!fetchModelsResult.ok) { + return; + } + await this.generateTaggers(); + await this.generateRecipeExecutor(); + + // If we don't have a cached vector, create a new one. + if (!this.interestVector) { + const interestVectorResult = await this.createInterestVector(); + // If that failed, log an error and return. + if (!interestVectorResult.ok) { + return; + } + this.interestVector = interestVectorResult.interestVector; + } + + // This happens outside the createInterestVector call above, + // because create can be skipped if rehydrating from cache. + // In that case, the interest vector is provided and not created, so we just set it. + await this.setInterestVector(); + + this.initialized = true; + if (callback) { + callback(); + } + } + + async calculateItemRelevanceScore(pocketItem) { + if (!this.initialized) { + return pocketItem.item_score || 1; + } + const itemRelevanceScore = await this.personalityProviderWorker.post( + "calculateItemRelevanceScore", + [pocketItem] + ); + if (!itemRelevanceScore) { + return -1; + } + const { scorableItem, rankingVector } = itemRelevanceScore; + // Put the results on the item for debugging purposes. + pocketItem.scorableItem = scorableItem; + pocketItem.rankingVector = rankingVector; + return rankingVector.score; + } + + /** + * Returns an object holding the personalization scores of this provider instance. + */ + getScores() { + return { + // We cannot return taggers here. + // What we return here goes into persistent cache, and taggers have functions on it. + // If we attempted to save taggers into persistent cache, it would store it to disk, + // and the next time we load it, it would start thowing function is not defined. + interestConfig: this.interestConfig, + interestVector: this.interestVector, + }; + } +} + +const EXPORTED_SYMBOLS = ["PersonalityProvider"]; diff --git a/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorker.js b/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorker.js new file mode 100644 index 0000000000..3ed118857e --- /dev/null +++ b/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorker.js @@ -0,0 +1,44 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* eslint-env mozilla/chrome-worker */ + +"use strict"; + +// Order of these are important. +/* import-globals-from /toolkit/components/workerloader/require.js */ +/* import-globals-from Tokenize.jsm */ +/* import-globals-from NaiveBayesTextTagger.jsm */ +/* import-globals-from NmfTextTagger.jsm */ +/* import-globals-from RecipeExecutor.jsm */ +/* import-globals-from PersonalityProviderWorkerClass.jsm */ +importScripts( + "resource://gre/modules/workers/require.js", + "resource://activity-stream/lib/PersonalityProvider/Tokenize.jsm", + "resource://activity-stream/lib/PersonalityProvider/NaiveBayesTextTagger.jsm", + "resource://activity-stream/lib/PersonalityProvider/NmfTextTagger.jsm", + "resource://activity-stream/lib/PersonalityProvider/RecipeExecutor.jsm", + "resource://activity-stream/lib/PersonalityProvider/PersonalityProviderWorkerClass.jsm" +); + +const PromiseWorker = require("resource://gre/modules/workers/PromiseWorker.js"); + +const personalityProviderWorker = new PersonalityProviderWorker(); + +// This is boiler plate worker stuff that connects it to the main thread PromiseWorker. +const worker = new PromiseWorker.AbstractWorker(); +worker.dispatch = function (method, args = []) { + return personalityProviderWorker[method](...args); +}; +worker.postMessage = function (message, ...transfers) { + self.postMessage(message, ...transfers); +}; +worker.close = function () { + self.close(); +}; + +self.addEventListener("message", msg => worker.handleMessage(msg)); +self.addEventListener("unhandledrejection", function (error) { + throw error.reason; +}); diff --git a/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorkerClass.jsm b/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorkerClass.jsm new file mode 100644 index 0000000000..e761f827d2 --- /dev/null +++ b/browser/components/newtab/lib/PersonalityProvider/PersonalityProviderWorkerClass.jsm @@ -0,0 +1,311 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +"use strict"; + +// PersonalityProviderWorker.js imports the following scripts before this. +/* import-globals-from Tokenize.jsm */ +/* import-globals-from NaiveBayesTextTagger.jsm */ +/* import-globals-from NmfTextTagger.jsm */ +/* import-globals-from RecipeExecutor.jsm */ + +// We load this into a worker using importScripts, and in tests using import. +// We use var to avoid name collision errors. +// eslint-disable-next-line no-var +var EXPORTED_SYMBOLS = ["PersonalityProviderWorker"]; + +// A helper function to create a hash out of a file. +async function _getFileHash(filepath) { + const data = await IOUtils.read(filepath); + // File is an instance of Uint8Array + const digest = await crypto.subtle.digest("SHA-256", data); + const uint8 = new Uint8Array(digest); + // return the two-digit hexadecimal code for a byte + const toHex = b => b.toString(16).padStart(2, "0"); + return Array.from(uint8, toHex).join(""); +} + +/** + * V2 provider builds and ranks an interest profile (also called an “interest vector”) off the browse history. + * This allows Firefox to classify pages into topics, by examining the text found on the page. + * It does this by looking at the history text content, title, and description. + */ +const PersonalityProviderWorker = class PersonalityProviderWorker { + async getPersonalityProviderDir() { + const personalityProviderDir = PathUtils.join( + await PathUtils.getLocalProfileDir(), + "personality-provider" + ); + + // Cache this so we don't need to await again. + this.getPersonalityProviderDir = () => + Promise.resolve(personalityProviderDir); + return personalityProviderDir; + } + + setBaseAttachmentsURL(url) { + this.baseAttachmentsURL = url; + } + + setInterestConfig(interestConfig) { + this.interestConfig = interestConfig; + } + + setInterestVector(interestVector) { + this.interestVector = interestVector; + } + + onSync(event) { + const { + data: { created, updated, deleted }, + } = event; + // Remove every removed attachment. + const toRemove = deleted.concat(updated.map(u => u.old)); + toRemove.forEach(record => this.deleteAttachment(record)); + + // Download every new/updated attachment. + const toDownload = created.concat(updated.map(u => u.new)); + // maybeDownloadAttachment is async but we don't care inside onSync. + toDownload.forEach(record => this.maybeDownloadAttachment(record)); + } + + /** + * Attempts to download the attachment, but only if it doesn't already exist. + */ + async maybeDownloadAttachment(record, retries = 3) { + const { + attachment: { filename, hash, size }, + } = record; + await IOUtils.makeDirectory(await this.getPersonalityProviderDir()); + const localFilePath = PathUtils.join( + await this.getPersonalityProviderDir(), + filename + ); + + let retry = 0; + while ( + retry++ < retries && + // exists is an issue for perf because I might not need to call it. + (!(await IOUtils.exists(localFilePath)) || + (await IOUtils.stat(localFilePath)).size !== size || + (await _getFileHash(localFilePath)) !== hash) + ) { + await this._downloadAttachment(record); + } + } + + /** + * Downloads the attachment to disk assuming the dir already exists + * and any existing files matching the filename are clobbered. + */ + async _downloadAttachment(record) { + const { + attachment: { location, filename }, + } = record; + const remoteFilePath = this.baseAttachmentsURL + location; + const localFilePath = PathUtils.join( + await this.getPersonalityProviderDir(), + filename + ); + + const xhr = new XMLHttpRequest(); + // Set false here for a synchronous request, because we're in a worker. + xhr.open("GET", remoteFilePath, false); + xhr.setRequestHeader("Accept-Encoding", "gzip"); + xhr.responseType = "arraybuffer"; + xhr.withCredentials = false; + xhr.send(null); + + if (xhr.status !== 200) { + console.error(`Failed to fetch ${remoteFilePath}: ${xhr.statusText}`); + return; + } + + const buffer = xhr.response; + const bytes = new Uint8Array(buffer); + + await IOUtils.write(localFilePath, bytes, { + tmpPath: `${localFilePath}.tmp`, + }); + } + + async deleteAttachment(record) { + const { + attachment: { filename }, + } = record; + await IOUtils.makeDirectory(await this.getPersonalityProviderDir()); + const path = PathUtils.join( + await this.getPersonalityProviderDir(), + filename + ); + + await IOUtils.remove(path, { ignoreAbsent: true }); + // Cleanup the directory if it is empty, do nothing if it is not empty. + try { + await IOUtils.remove(await this.getPersonalityProviderDir(), { + ignoreAbsent: true, + }); + } catch (e) { + // This is likely because the directory is not empty, so we don't care. + } + } + + /** + * Gets contents of the attachment if it already exists on file, + * and if not attempts to download it. + */ + async getAttachment(record) { + const { + attachment: { filename }, + } = record; + const filepath = PathUtils.join( + await this.getPersonalityProviderDir(), + filename + ); + + try { + await this.maybeDownloadAttachment(record); + return await IOUtils.readJSON(filepath); + } catch (error) { + console.error(`Failed to load ${filepath}: ${error.message}`); + } + return {}; + } + + async fetchModels(models) { + this.models = await Promise.all( + models.map(async record => ({ + ...(await this.getAttachment(record)), + recordKey: record.key, + })) + ); + if (!this.models.length) { + return { + ok: false, + }; + } + return { + ok: true, + }; + } + + generateTaggers(modelKeys) { + if (!this.taggers) { + let nbTaggers = []; + let nmfTaggers = {}; + + for (let model of this.models) { + if (!modelKeys.includes(model.recordKey)) { + continue; + } + if (model.model_type === "nb") { + nbTaggers.push(new NaiveBayesTextTagger(model, toksToTfIdfVector)); + } else if (model.model_type === "nmf") { + nmfTaggers[model.parent_tag] = new NmfTextTagger( + model, + toksToTfIdfVector + ); + } + } + this.taggers = { nbTaggers, nmfTaggers }; + } + } + + /** + * Sets and generates a Recipe Executor. + * A Recipe Executor is a set of actions that can be consumed by a Recipe. + * The Recipe determines the order and specifics of which the actions are called. + */ + generateRecipeExecutor() { + const recipeExecutor = new RecipeExecutor( + this.taggers.nbTaggers, + this.taggers.nmfTaggers, + tokenize + ); + this.recipeExecutor = recipeExecutor; + } + + /** + * Examines the user's browse history and returns an interest vector that + * describes the topics the user frequently browses. + */ + createInterestVector(history) { + let interestVector = {}; + + for (let historyRec of history) { + let ivItem = this.recipeExecutor.executeRecipe( + historyRec, + this.interestConfig.history_item_builder + ); + if (ivItem === null) { + continue; + } + interestVector = this.recipeExecutor.executeCombinerRecipe( + interestVector, + ivItem, + this.interestConfig.interest_combiner + ); + if (interestVector === null) { + return null; + } + } + + const finalResult = this.recipeExecutor.executeRecipe( + interestVector, + this.interestConfig.interest_finalizer + ); + + return { + ok: true, + interestVector: finalResult, + }; + } + + /** + * Calculates a score of a Pocket item when compared to the user's interest + * vector. Returns the score. Higher scores are better. Assumes this.interestVector + * is populated. + */ + calculateItemRelevanceScore(pocketItem) { + const { personalization_models } = pocketItem; + let scorableItem; + + // If the server provides some models, we can just use them, + // and skip generating them. + if (personalization_models && Object.keys(personalization_models).length) { + scorableItem = { + id: pocketItem.id, + item_tags: personalization_models, + item_score: pocketItem.item_score, + item_sort_id: 1, + }; + } else { + scorableItem = this.recipeExecutor.executeRecipe( + pocketItem, + this.interestConfig.item_to_rank_builder + ); + if (scorableItem === null) { + return null; + } + } + + // We're doing a deep copy on an object. + let rankingVector = JSON.parse(JSON.stringify(this.interestVector)); + + Object.keys(scorableItem).forEach(key => { + rankingVector[key] = scorableItem[key]; + }); + + rankingVector = this.recipeExecutor.executeRecipe( + rankingVector, + this.interestConfig.item_ranker + ); + + if (rankingVector === null) { + return null; + } + + return { scorableItem, rankingVector }; + } +}; diff --git a/browser/components/newtab/lib/PersonalityProvider/RecipeExecutor.jsm b/browser/components/newtab/lib/PersonalityProvider/RecipeExecutor.jsm new file mode 100644 index 0000000000..9dbf8b802d --- /dev/null +++ b/browser/components/newtab/lib/PersonalityProvider/RecipeExecutor.jsm @@ -0,0 +1,1126 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +"use strict"; + +// We load this into a worker using importScripts, and in tests using import. +// We use var to avoid name collision errors. +// eslint-disable-next-line no-var +var EXPORTED_SYMBOLS = ["RecipeExecutor"]; + +/** + * RecipeExecutor is the core feature engineering pipeline for the in-browser + * personalization work. These pipelines are called "recipes". A recipe is an + * array of objects that define a "step" in the recipe. A step is simply an + * object with a field "function" that specifies what is being done in the step + * along with other fields that are semantically defined for that step. + * + * There are two types of recipes "builder" recipes and "combiner" recipes. Builder + * recipes mutate an object until it matches some set of critera. Combiner + * recipes take two objects, (a "left" and a "right"), and specify the steps + * to merge the right object into the left object. + * + * A short nonsense example recipe is: + * [ {"function": "get_url_domain", "path_length": 1, "field": "url", "dest": "url_domain"}, + * {"function": "nb_tag", "fields": ["title", "description"]}, + * {"function": "conditionally_nmf_tag", "fields": ["title", "description"]} ] + * + * Recipes are sandboxed by the fact that the step functions must be explicitly + * allowed. Functions allowed for builder recipes are specifed in the + * RecipeExecutor.ITEM_BUILDER_REGISTRY, while combiner functions are allowed + * in RecipeExecutor.ITEM_COMBINER_REGISTRY . + */ +const RecipeExecutor = class RecipeExecutor { + constructor(nbTaggers, nmfTaggers, tokenize) { + this.ITEM_BUILDER_REGISTRY = { + nb_tag: this.naiveBayesTag, + conditionally_nmf_tag: this.conditionallyNmfTag, + accept_item_by_field_value: this.acceptItemByFieldValue, + tokenize_url: this.tokenizeUrl, + get_url_domain: this.getUrlDomain, + tokenize_field: this.tokenizeField, + copy_value: this.copyValue, + keep_top_k: this.keepTopK, + scalar_multiply: this.scalarMultiply, + elementwise_multiply: this.elementwiseMultiply, + vector_multiply: this.vectorMultiply, + scalar_add: this.scalarAdd, + vector_add: this.vectorAdd, + make_boolean: this.makeBoolean, + allow_fields: this.allowFields, + filter_by_value: this.filterByValue, + l2_normalize: this.l2Normalize, + prob_normalize: this.probNormalize, + set_default: this.setDefault, + lookup_value: this.lookupValue, + copy_to_map: this.copyToMap, + scalar_multiply_tag: this.scalarMultiplyTag, + apply_softmax_tags: this.applySoftmaxTags, + }; + this.ITEM_COMBINER_REGISTRY = { + combiner_add: this.combinerAdd, + combiner_max: this.combinerMax, + combiner_collect_values: this.combinerCollectValues, + }; + this.nbTaggers = nbTaggers; + this.nmfTaggers = nmfTaggers; + this.tokenize = tokenize; + } + + /** + * Determines the type of a field. Valid types are: + * string + * number + * array + * map (strings to anything) + */ + _typeOf(data) { + let t = typeof data; + if (t === "object") { + if (data === null) { + return "null"; + } + if (Array.isArray(data)) { + return "array"; + } + return "map"; + } + return t; + } + + /** + * Returns a scalar, either because it was a constant, or by + * looking it up from the item. Allows for a default value if the lookup + * fails. + */ + _lookupScalar(item, k, dfault) { + if (this._typeOf(k) === "number") { + return k; + } else if ( + this._typeOf(k) === "string" && + k in item && + this._typeOf(item[k]) === "number" + ) { + return item[k]; + } + return dfault; + } + + /** + * Simply appends all the strings from a set fields together. If the field + * is a list, then the cells of the list are append. + */ + _assembleText(item, fields) { + let textArr = []; + for (let field of fields) { + if (field in item) { + let type = this._typeOf(item[field]); + if (type === "string") { + textArr.push(item[field]); + } else if (type === "array") { + for (let ele of item[field]) { + textArr.push(String(ele)); + } + } else { + textArr.push(String(item[field])); + } + } + } + return textArr.join(" "); + } + + /** + * Runs the naive bayes text taggers over a set of text fields. Stores the + * results in new fields: + * nb_tags: a map of text strings to probabilites + * nb_tokens: the tokenized text that was tagged + * + * Config: + * fields: an array containing a list of fields to concatenate and tag + */ + naiveBayesTag(item, config) { + let text = this._assembleText(item, config.fields); + let tokens = this.tokenize(text); + let tags = {}; + let extended_tags = {}; + + for (let nbTagger of this.nbTaggers) { + let result = nbTagger.tagTokens(tokens); + if (result.label !== null && result.confident) { + extended_tags[result.label] = result; + tags[result.label] = Math.exp(result.logProb); + } + } + item.nb_tags = tags; + item.nb_tags_extended = extended_tags; + item.nb_tokens = tokens; + return item; + } + + /** + * Selectively runs NMF text taggers depending on which tags were found + * by the naive bayes taggers. Writes the results in into new fields: + * nmf_tags_parent_weights: map of pareent tags to probabilites of those parent tags + * nmf_tags: map of strings to maps of strings to probabilities + * nmf_tags_parent map of child tags to parent tags + * + * Config: + * Not configurable + */ + conditionallyNmfTag(item, config) { + let nestedNmfTags = {}; + let parentTags = {}; + let parentWeights = {}; + + if (!("nb_tags" in item) || !("nb_tokens" in item)) { + return null; + } + + Object.keys(item.nb_tags).forEach(parentTag => { + let nmfTagger = this.nmfTaggers[parentTag]; + if (nmfTagger !== undefined) { + nestedNmfTags[parentTag] = {}; + parentWeights[parentTag] = item.nb_tags[parentTag]; + let nmfTags = nmfTagger.tagTokens(item.nb_tokens); + Object.keys(nmfTags).forEach(nmfTag => { + nestedNmfTags[parentTag][nmfTag] = nmfTags[nmfTag]; + parentTags[nmfTag] = parentTag; + }); + } + }); + + item.nmf_tags = nestedNmfTags; + item.nmf_tags_parent = parentTags; + item.nmf_tags_parent_weights = parentWeights; + + return item; + } + + /** + * Checks a field's value against another value (either from another field + * or a constant). If the test passes, then the item is emitted, otherwise + * the pipeline is aborted. + * + * Config: + * field Field to read the value to test. Left side of operator. + * op one of ==, !=, <, <=, >, >= + * rhsValue Constant value to compare against. Right side of operator. + * rhsField Field to read value to compare against. Right side of operator. + * + * NOTE: rhsValue takes precidence over rhsField. + */ + acceptItemByFieldValue(item, config) { + if (!(config.field in item)) { + return null; + } + let rhs = null; + if ("rhsValue" in config) { + rhs = config.rhsValue; + } else if ("rhsField" in config && config.rhsField in item) { + rhs = item[config.rhsField]; + } + if (rhs === null) { + return null; + } + + if ( + // eslint-disable-next-line eqeqeq + (config.op === "==" && item[config.field] == rhs) || + // eslint-disable-next-line eqeqeq + (config.op === "!=" && item[config.field] != rhs) || + (config.op === "<" && item[config.field] < rhs) || + (config.op === "<=" && item[config.field] <= rhs) || + (config.op === ">" && item[config.field] > rhs) || + (config.op === ">=" && item[config.field] >= rhs) + ) { + return item; + } + + return null; + } + + /** + * Splits a URL into text-like tokens. + * + * Config: + * field Field containing a URL + * dest Field to write the tokens to as an array of strings + * + * NOTE: Any initial 'www' on the hostname is removed. + */ + tokenizeUrl(item, config) { + if (!(config.field in item)) { + return null; + } + + let url = new URL(item[config.field]); + let domain = url.hostname; + if (domain.startsWith("www.")) { + domain = domain.substring(4); + } + let toks = this.tokenize(domain); + let pathToks = this.tokenize( + decodeURIComponent(url.pathname.replace(/\+/g, " ")) + ); + for (let tok of pathToks) { + toks.push(tok); + } + for (let pair of url.searchParams.entries()) { + let k = this.tokenize(decodeURIComponent(pair[0].replace(/\+/g, " "))); + for (let tok of k) { + toks.push(tok); + } + if (pair[1] !== null && pair[1] !== "") { + let v = this.tokenize(decodeURIComponent(pair[1].replace(/\+/g, " "))); + for (let tok of v) { + toks.push(tok); + } + } + } + item[config.dest] = toks; + + return item; + } + + /** + * Gets the hostname (minus any initial "www." along with the left most + * directories on the path. + * + * Config: + * field Field containing the URL + * dest Field to write the array of strings to + * path_length OPTIONAL (DEFAULT: 0) Number of leftmost subdirectories to include + */ + getUrlDomain(item, config) { + if (!(config.field in item)) { + return null; + } + + let url = new URL(item[config.field]); + let domain = url.hostname.toLocaleLowerCase(); + if (domain.startsWith("www.")) { + domain = domain.substring(4); + } + item[config.dest] = domain; + let pathLength = 0; + if ("path_length" in config) { + pathLength = config.path_length; + } + if (pathLength > 0) { + item[config.dest] += url.pathname + .toLocaleLowerCase() + .split("/") + .slice(0, pathLength + 1) + .join("/"); + } + + return item; + } + + /** + * Splits a field into tokens. + * Config: + * field Field containing a string to tokenize + * dest Field to write the array of strings to + */ + tokenizeField(item, config) { + if (!(config.field in item)) { + return null; + } + + item[config.dest] = this.tokenize(item[config.field]); + + return item; + } + + /** + * Deep copy from one field to another. + * Config: + * src Field to read from + * dest Field to write to + */ + copyValue(item, config) { + if (!(config.src in item)) { + return null; + } + + item[config.dest] = JSON.parse(JSON.stringify(item[config.src])); + + return item; + } + + /** + * Converts a field containing a map of strings to a map of strings + * to numbers, to a map of strings to numbers containing at most k elements. + * This operation is performed by first, promoting all the subkeys up one + * level, and then taking the top (or bottom) k values. + * + * Config: + * field Points to a map of strings to a map of strings to numbers + * k Maximum number of items to keep + * descending OPTIONAL (DEFAULT: True) Sorts score in descending order + * (i.e. keeps maximum) + */ + keepTopK(item, config) { + if (!(config.field in item)) { + return null; + } + let k = this._lookupScalar(item, config.k, 1048576); + let descending = !("descending" in config) || config.descending !== false; + + // we can't sort by the values in the map, so we have to convert this + // to an array, and then sort. + let sortable = []; + Object.keys(item[config.field]).forEach(outerKey => { + let innerType = this._typeOf(item[config.field][outerKey]); + if (innerType === "map") { + Object.keys(item[config.field][outerKey]).forEach(innerKey => { + sortable.push({ + key: innerKey, + value: item[config.field][outerKey][innerKey], + }); + }); + } else { + sortable.push({ key: outerKey, value: item[config.field][outerKey] }); + } + }); + + sortable.sort((a, b) => { + if (descending) { + return b.value - a.value; + } + return a.value - b.value; + }); + + // now take the top k + let newMap = {}; + let i = 0; + for (let pair of sortable) { + if (i >= k) { + break; + } + newMap[pair.key] = pair.value; + i++; + } + item[config.field] = newMap; + + return item; + } + + /** + * Scalar multiplies a vector by some constant + * + * Config: + * field Points to: + * a map of strings to numbers + * an array of numbers + * a number + * k Either a number, or a string. If it's a number then This + * is the scalar value to multiply by. If it's a string, + * the value in the pointed to field is used. + * default OPTIONAL (DEFAULT: 0), If k is a string, and no numeric + * value is found, then use this value. + */ + scalarMultiply(item, config) { + if (!(config.field in item)) { + return null; + } + let k = this._lookupScalar(item, config.k, config.dfault); + + let fieldType = this._typeOf(item[config.field]); + if (fieldType === "number") { + item[config.field] *= k; + } else if (fieldType === "array") { + for (let i = 0; i < item[config.field].length; i++) { + item[config.field][i] *= k; + } + } else if (fieldType === "map") { + Object.keys(item[config.field]).forEach(key => { + item[config.field][key] *= k; + }); + } else { + return null; + } + + return item; + } + + /** + * Elementwise multiplies either two maps or two arrays together, storing + * the result in left. If left and right are of the same type, results in an + * error. + * + * Maps are special case. For maps the left must be a nested map such as: + * { k1: { k11: 1, k12: 2}, k2: { k21: 3, k22: 4 } } and right needs to be + * simple map such as: { k1: 5, k2: 6} . The operation is then to mulitply + * every value of every right key, to every value every subkey where the + * parent keys match. Using the previous examples, the result would be: + * { k1: { k11: 5, k12: 10 }, k2: { k21: 18, k22: 24 } } . + * + * Config: + * left + * right + */ + elementwiseMultiply(item, config) { + if (!(config.left in item) || !(config.right in item)) { + return null; + } + let leftType = this._typeOf(item[config.left]); + if (leftType !== this._typeOf(item[config.right])) { + return null; + } + if (leftType === "array") { + if (item[config.left].length !== item[config.right].length) { + return null; + } + for (let i = 0; i < item[config.left].length; i++) { + item[config.left][i] *= item[config.right][i]; + } + } else if (leftType === "map") { + Object.keys(item[config.left]).forEach(outerKey => { + let r = 0.0; + if (outerKey in item[config.right]) { + r = item[config.right][outerKey]; + } + Object.keys(item[config.left][outerKey]).forEach(innerKey => { + item[config.left][outerKey][innerKey] *= r; + }); + }); + } else if (leftType === "number") { + item[config.left] *= item[config.right]; + } else { + return null; + } + + return item; + } + + /** + * Vector multiplies (i.e. dot products) two vectors and stores the result in + * third field. Both vectors must either by maps, or arrays of numbers with + * the same length. + * + * Config: + * left A field pointing to either a map of strings to numbers, + * or an array of numbers + * right A field pointing to either a map of strings to numbers, + * or an array of numbers + * dest The field to store the dot product. + */ + vectorMultiply(item, config) { + if (!(config.left in item) || !(config.right in item)) { + return null; + } + + let leftType = this._typeOf(item[config.left]); + if (leftType !== this._typeOf(item[config.right])) { + return null; + } + + let destVal = 0.0; + if (leftType === "array") { + if (item[config.left].length !== item[config.right].length) { + return null; + } + for (let i = 0; i < item[config.left].length; i++) { + destVal += item[config.left][i] * item[config.right][i]; + } + } else if (leftType === "map") { + Object.keys(item[config.left]).forEach(key => { + if (key in item[config.right]) { + destVal += item[config.left][key] * item[config.right][key]; + } + }); + } else { + return null; + } + + item[config.dest] = destVal; + return item; + } + + /** + * Adds a constant value to all elements in the field. Mathematically, + * this is the same as taking a 1-vector, scalar multiplying it by k, + * and then vector adding it to a field. + * + * Config: + * field A field pointing to either a map of strings to numbers, + * or an array of numbers + * k Either a number, or a string. If it's a number then This + * is the scalar value to multiply by. If it's a string, + * the value in the pointed to field is used. + * default OPTIONAL (DEFAULT: 0), If k is a string, and no numeric + * value is found, then use this value. + */ + scalarAdd(item, config) { + let k = this._lookupScalar(item, config.k, config.dfault); + if (!(config.field in item)) { + return null; + } + + let fieldType = this._typeOf(item[config.field]); + if (fieldType === "array") { + for (let i = 0; i < item[config.field].length; i++) { + item[config.field][i] += k; + } + } else if (fieldType === "map") { + Object.keys(item[config.field]).forEach(key => { + item[config.field][key] += k; + }); + } else if (fieldType === "number") { + item[config.field] += k; + } else { + return null; + } + + return item; + } + + /** + * Adds two vectors together and stores the result in left. + * + * Config: + * left A field pointing to either a map of strings to numbers, + * or an array of numbers + * right A field pointing to either a map of strings to numbers, + * or an array of numbers + */ + vectorAdd(item, config) { + if (!(config.left in item)) { + return this.copyValue(item, { src: config.right, dest: config.left }); + } + if (!(config.right in item)) { + return null; + } + + let leftType = this._typeOf(item[config.left]); + if (leftType !== this._typeOf(item[config.right])) { + return null; + } + if (leftType === "array") { + if (item[config.left].length !== item[config.right].length) { + return null; + } + for (let i = 0; i < item[config.left].length; i++) { + item[config.left][i] += item[config.right][i]; + } + return item; + } else if (leftType === "map") { + Object.keys(item[config.right]).forEach(key => { + let v = 0; + if (key in item[config.left]) { + v = item[config.left][key]; + } + item[config.left][key] = v + item[config.right][key]; + }); + return item; + } + + return null; + } + + /** + * Converts a vector from real values to boolean integers. (i.e. either 1/0 + * or 1/-1). + * + * Config: + * field Field containing either a map of strings to numbers or + * an array of numbers to convert. + * threshold OPTIONAL (DEFAULT: 0) Values above this will be replaced + * with 1.0. Those below will be converted to 0. + * keep_negative OPTIONAL (DEFAULT: False) If true, values below the + * threshold will be converted to -1 instead of 0. + */ + makeBoolean(item, config) { + if (!(config.field in item)) { + return null; + } + let threshold = this._lookupScalar(item, config.threshold, 0.0); + let type = this._typeOf(item[config.field]); + if (type === "array") { + for (let i = 0; i < item[config.field].length; i++) { + if (item[config.field][i] > threshold) { + item[config.field][i] = 1.0; + } else if (config.keep_negative) { + item[config.field][i] = -1.0; + } else { + item[config.field][i] = 0.0; + } + } + } else if (type === "map") { + Object.keys(item[config.field]).forEach(key => { + let value = item[config.field][key]; + if (value > threshold) { + item[config.field][key] = 1.0; + } else if (config.keep_negative) { + item[config.field][key] = -1.0; + } else { + item[config.field][key] = 0.0; + } + }); + } else if (type === "number") { + let value = item[config.field]; + if (value > threshold) { + item[config.field] = 1.0; + } else if (config.keep_negative) { + item[config.field] = -1.0; + } else { + item[config.field] = 0.0; + } + } else { + return null; + } + + return item; + } + + /** + * Removes all keys from the item except for the ones specified. + * + * fields An array of strings indicating the fields to keep + */ + allowFields(item, config) { + let newItem = {}; + for (let ele of config.fields) { + if (ele in item) { + newItem[ele] = item[ele]; + } + } + return newItem; + } + + /** + * Removes all keys whose value does not exceed some threshold. + * + * Config: + * field Points to a map of strings to numbers + * threshold Values must exceed this value, otherwise they are removed. + */ + filterByValue(item, config) { + if (!(config.field in item)) { + return null; + } + let threshold = this._lookupScalar(item, config.threshold, 0.0); + let filtered = {}; + Object.keys(item[config.field]).forEach(key => { + let value = item[config.field][key]; + if (value > threshold) { + filtered[key] = value; + } + }); + item[config.field] = filtered; + + return item; + } + + /** + * Rewrites a field so that its values are now L2 normed. + * + * Config: + * field Points to a map of strings to numbers, or an array of numbers + */ + l2Normalize(item, config) { + if (!(config.field in item)) { + return null; + } + let data = item[config.field]; + let type = this._typeOf(data); + if (type === "array") { + let norm = 0.0; + for (let datum of data) { + norm += datum * datum; + } + norm = Math.sqrt(norm); + if (norm !== 0) { + for (let i = 0; i < data.length; i++) { + data[i] /= norm; + } + } + } else if (type === "map") { + let norm = 0.0; + Object.keys(data).forEach(key => { + norm += data[key] * data[key]; + }); + norm = Math.sqrt(norm); + if (norm !== 0) { + Object.keys(data).forEach(key => { + data[key] /= norm; + }); + } + } else { + return null; + } + + item[config.field] = data; + + return item; + } + + /** + * Rewrites a field so that all of its values sum to 1.0 + * + * Config: + * field Points to a map of strings to numbers, or an array of numbers + */ + probNormalize(item, config) { + if (!(config.field in item)) { + return null; + } + let data = item[config.field]; + let type = this._typeOf(data); + if (type === "array") { + let norm = 0.0; + for (let datum of data) { + norm += datum; + } + if (norm !== 0) { + for (let i = 0; i < data.length; i++) { + data[i] /= norm; + } + } + } else if (type === "map") { + let norm = 0.0; + Object.keys(item[config.field]).forEach(key => { + norm += item[config.field][key]; + }); + if (norm !== 0) { + Object.keys(item[config.field]).forEach(key => { + item[config.field][key] /= norm; + }); + } + } else { + return null; + } + + return item; + } + + /** + * Stores a value, if it is not already present + * + * Config: + * field field to write to if it is missing + * value value to store in that field + */ + setDefault(item, config) { + let val = this._lookupScalar(item, config.value, config.value); + if (!(config.field in item)) { + item[config.field] = val; + } + + return item; + } + + /** + * Selctively promotes an value from an inner map up to the outer map + * + * Config: + * haystack Points to a map of strings to values + * needle Key inside the map we should promote up + * dest Where we should write the value of haystack[needle] + */ + lookupValue(item, config) { + if (config.haystack in item && config.needle in item[config.haystack]) { + item[config.dest] = item[config.haystack][config.needle]; + } + + return item; + } + + /** + * Demotes a field into a map + * + * Config: + * src Field to copy + * dest_map Points to a map + * dest_key Key inside dest_map to copy src to + */ + copyToMap(item, config) { + if (config.src in item) { + if (!(config.dest_map in item)) { + item[config.dest_map] = {}; + } + item[config.dest_map][config.dest_key] = item[config.src]; + } + + return item; + } + + /** + * Config: + * field Points to a string to number map + * k Scalar to multiply the values by + * log_scale Boolean, if true, then the values will be transformed + * by a logrithm prior to multiplications + */ + scalarMultiplyTag(item, config) { + let EPSILON = 0.000001; + if (!(config.field in item)) { + return null; + } + let k = this._lookupScalar(item, config.k, 1); + let type = this._typeOf(item[config.field]); + if (type === "map") { + Object.keys(item[config.field]).forEach(parentKey => { + Object.keys(item[config.field][parentKey]).forEach(key => { + let v = item[config.field][parentKey][key]; + if (config.log_scale) { + v = Math.log(v + EPSILON); + } + item[config.field][parentKey][key] = v * k; + }); + }); + } else { + return null; + } + + return item; + } + + /** + * Independently applies softmax across all subtags. + * + * Config: + * field Points to a map of strings with values being another map of strings + */ + applySoftmaxTags(item, config) { + let type = this._typeOf(item[config.field]); + if (type !== "map") { + return null; + } + + let abort = false; + let softmaxSum = {}; + Object.keys(item[config.field]).forEach(tag => { + if (this._typeOf(item[config.field][tag]) !== "map") { + abort = true; + return; + } + if (abort) { + return; + } + softmaxSum[tag] = 0; + Object.keys(item[config.field][tag]).forEach(subtag => { + if (this._typeOf(item[config.field][tag][subtag]) !== "number") { + abort = true; + return; + } + let score = item[config.field][tag][subtag]; + softmaxSum[tag] += Math.exp(score); + }); + }); + if (abort) { + return null; + } + + Object.keys(item[config.field]).forEach(tag => { + Object.keys(item[config.field][tag]).forEach(subtag => { + item[config.field][tag][subtag] = + Math.exp(item[config.field][tag][subtag]) / softmaxSum[tag]; + }); + }); + + return item; + } + + /** + * Vector adds a field and stores the result in left. + * + * Config: + * field The field to vector add + */ + combinerAdd(left, right, config) { + if (!(config.field in right)) { + return left; + } + let type = this._typeOf(right[config.field]); + if (!(config.field in left)) { + if (type === "map") { + left[config.field] = {}; + } else if (type === "array") { + left[config.field] = []; + } else if (type === "number") { + left[config.field] = 0; + } else { + return null; + } + } + if (type !== this._typeOf(left[config.field])) { + return null; + } + if (type === "map") { + Object.keys(right[config.field]).forEach(key => { + if (!(key in left[config.field])) { + left[config.field][key] = 0; + } + left[config.field][key] += right[config.field][key]; + }); + } else if (type === "array") { + for (let i = 0; i < right[config.field].length; i++) { + if (i < left[config.field].length) { + left[config.field][i] += right[config.field][i]; + } else { + left[config.field].push(right[config.field][i]); + } + } + } else if (type === "number") { + left[config.field] += right[config.field]; + } else { + return null; + } + + return left; + } + + /** + * Stores the maximum value of the field in left. + * + * Config: + * field The field to vector add + */ + combinerMax(left, right, config) { + if (!(config.field in right)) { + return left; + } + let type = this._typeOf(right[config.field]); + if (!(config.field in left)) { + if (type === "map") { + left[config.field] = {}; + } else if (type === "array") { + left[config.field] = []; + } else if (type === "number") { + left[config.field] = 0; + } else { + return null; + } + } + if (type !== this._typeOf(left[config.field])) { + return null; + } + if (type === "map") { + Object.keys(right[config.field]).forEach(key => { + if ( + !(key in left[config.field]) || + right[config.field][key] > left[config.field][key] + ) { + left[config.field][key] = right[config.field][key]; + } + }); + } else if (type === "array") { + for (let i = 0; i < right[config.field].length; i++) { + if (i < left[config.field].length) { + if (left[config.field][i] < right[config.field][i]) { + left[config.field][i] = right[config.field][i]; + } + } else { + left[config.field].push(right[config.field][i]); + } + } + } else if (type === "number") { + if (left[config.field] < right[config.field]) { + left[config.field] = right[config.field]; + } + } else { + return null; + } + + return left; + } + + /** + * Associates a value in right with another value in right. This association + * is then stored in a map in left. + * + * For example: If a sequence of rights is: + * { 'tags': {}, 'url_domain': 'maseratiusa.com/maserati', 'time': 41 } + * { 'tags': {}, 'url_domain': 'mbusa.com/mercedes', 'time': 21 } + * { 'tags': {}, 'url_domain': 'maseratiusa.com/maserati', 'time': 34 } + * + * Then assuming a 'sum' operation, left can build a map that would look like: + * { + * 'maseratiusa.com/maserati': 75, + * 'mbusa.com/mercedes': 21, + * } + * + * Fields: + * left_field field in the left to store / update the map + * right_key_field Field in the right to use as a key + * right_value_field Field in the right to use as a value + * operation One of "sum", "max", "overwrite", "count" + */ + combinerCollectValues(left, right, config) { + let op; + if (config.operation === "sum") { + op = (a, b) => a + b; + } else if (config.operation === "max") { + op = (a, b) => (a > b ? a : b); + } else if (config.operation === "overwrite") { + op = (a, b) => b; + } else if (config.operation === "count") { + op = (a, b) => a + 1; + } else { + return null; + } + if (!(config.left_field in left)) { + left[config.left_field] = {}; + } + if ( + !(config.right_key_field in right) || + !(config.right_value_field in right) + ) { + return left; + } + + let key = right[config.right_key_field]; + let rightValue = right[config.right_value_field]; + let leftValue = 0.0; + if (key in left[config.left_field]) { + leftValue = left[config.left_field][key]; + } + + left[config.left_field][key] = op(leftValue, rightValue); + + return left; + } + + /** + * Executes a recipe. Returns an object on success, or null on failure. + */ + executeRecipe(item, recipe) { + let newItem = item; + if (recipe) { + for (let step of recipe) { + let op = this.ITEM_BUILDER_REGISTRY[step.function]; + if (op === undefined) { + return null; + } + newItem = op.call(this, newItem, step); + if (newItem === null) { + break; + } + } + } + return newItem; + } + + /** + * Executes a recipe. Returns an object on success, or null on failure. + */ + executeCombinerRecipe(item1, item2, recipe) { + let newItem1 = item1; + for (let step of recipe) { + let op = this.ITEM_COMBINER_REGISTRY[step.function]; + if (op === undefined) { + return null; + } + newItem1 = op.call(this, newItem1, item2, step); + if (newItem1 === null) { + break; + } + } + + return newItem1; + } +}; diff --git a/browser/components/newtab/lib/PersonalityProvider/Tokenize.jsm b/browser/components/newtab/lib/PersonalityProvider/Tokenize.jsm new file mode 100644 index 0000000000..94835557a6 --- /dev/null +++ b/browser/components/newtab/lib/PersonalityProvider/Tokenize.jsm @@ -0,0 +1,89 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +"use strict"; + +// We load this into a worker using importScripts, and in tests using import. +// We use var to avoid name collision errors. +// eslint-disable-next-line no-var +var EXPORTED_SYMBOLS = ["tokenize", "toksToTfIdfVector"]; + +// Unicode specifies certain mnemonics for code pages and character classes. +// They call them "character properties" https://en.wikipedia.org/wiki/Unicode_character_property . +// These mnemonics are have been adopted by many regular expression libraries, +// however the standard Javascript regexp system doesn't support unicode +// character properties, so we have to define these ourself. +// +// Each of these sections contains the characters values / ranges for specific +// character property: Whitespace, Symbol (S), Punctuation (P), Number (N), +// Mark (M), and Letter (L). +const UNICODE_SPACE = + "\x20\xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000"; +const UNICODE_SYMBOL = + "\\x24\\x2B\x3C-\x3E\\x5E\x60\\x7C\x7E\xA2-\xA6\xA8\xA9\xAC\xAE-\xB1\xB4\xB8\xD7\xF7\u02C2-\u02C5\u02D2-\u02DF\u02E5-\u02EB\u02ED\u02EF-\u02FF\u0375\u0384\u0385\u03F6\u0482\u058D-\u058F\u0606-\u0608\u060B\u060E\u060F\u06DE\u06E9\u06FD\u06FE\u07F6\u09F2\u09F3\u09FA\u09FB\u0AF1\u0B70\u0BF3-\u0BFA\u0C7F\u0D4F\u0D79\u0E3F\u0F01-\u0F03\u0F13\u0F15-\u0F17\u0F1A-\u0F1F\u0F34\u0F36\u0F38\u0FBE-\u0FC5\u0FC7-\u0FCC\u0FCE\u0FCF\u0FD5-\u0FD8\u109E\u109F\u1390-\u1399\u17DB\u1940\u19DE-\u19FF\u1B61-\u1B6A\u1B74-\u1B7C\u1FBD\u1FBF-\u1FC1\u1FCD-\u1FCF\u1FDD-\u1FDF\u1FED-\u1FEF\u1FFD\u1FFE\u2044\u2052\u207A-\u207C\u208A-\u208C\u20A0-\u20BE\u2100\u2101\u2103-\u2106\u2108\u2109\u2114\u2116-\u2118\u211E-\u2123\u2125\u2127\u2129\u212E\u213A\u213B\u2140-\u2144\u214A-\u214D\u214F\u218A\u218B\u2190-\u2307\u230C-\u2328\u232B-\u23FE\u2400-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u2767\u2794-\u27C4\u27C7-\u27E5\u27F0-\u2982\u2999-\u29D7\u29DC-\u29FB\u29FE-\u2B73\u2B76-\u2B95\u2B98-\u2BB9\u2BBD-\u2BC8\u2BCA-\u2BD1\u2BEC-\u2BEF\u2CE5-\u2CEA\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB\u3004\u3012\u3013\u3020\u3036\u3037\u303E\u303F\u309B\u309C\u3190\u3191\u3196-\u319F\u31C0-\u31E3\u3200-\u321E\u322A-\u3247\u3250\u3260-\u327F\u328A-\u32B0\u32C0-\u32FE\u3300-\u33FF\u4DC0-\u4DFF\uA490-\uA4C6\uA700-\uA716\uA720\uA721\uA789\uA78A\uA828-\uA82B\uA836-\uA839\uAA77-\uAA79\uAB5B\uFB29\uFBB2-\uFBC1\uFDFC\uFDFD\uFE62\uFE64-\uFE66\uFE69\uFF04\uFF0B\uFF1C-\uFF1E\uFF3E\uFF40\uFF5C\uFF5E\uFFE0-\uFFE6\uFFE8-\uFFEE\uFFFC\uFFFD"; +const UNICODE_PUNCT = + "\x21-\x23\x25-\\x2A\x2C-\x2F\x3A\x3B\\x3F\x40\\x5B-\\x5D\x5F\\x7B\x7D\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u0AF0\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E44\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65"; + +const UNICODE_NUMBER = + "0-9\xB2\xB3\xB9\xBC-\xBE\u0660-\u0669\u06F0-\u06F9\u07C0-\u07C9\u0966-\u096F\u09E6-\u09EF\u09F4-\u09F9\u0A66-\u0A6F\u0AE6-\u0AEF\u0B66-\u0B6F\u0B72-\u0B77\u0BE6-\u0BF2\u0C66-\u0C6F\u0C78-\u0C7E\u0CE6-\u0CEF\u0D58-\u0D5E\u0D66-\u0D78\u0DE6-\u0DEF\u0E50-\u0E59\u0ED0-\u0ED9\u0F20-\u0F33\u1040-\u1049\u1090-\u1099\u1369-\u137C\u16EE-\u16F0\u17E0-\u17E9\u17F0-\u17F9\u1810-\u1819\u1946-\u194F\u19D0-\u19DA\u1A80-\u1A89\u1A90-\u1A99\u1B50-\u1B59\u1BB0-\u1BB9\u1C40-\u1C49\u1C50-\u1C59\u2070\u2074-\u2079\u2080-\u2089\u2150-\u2182\u2185-\u2189\u2460-\u249B\u24EA-\u24FF\u2776-\u2793\u2CFD\u3007\u3021-\u3029\u3038-\u303A\u3192-\u3195\u3220-\u3229\u3248-\u324F\u3251-\u325F\u3280-\u3289\u32B1-\u32BF\uA620-\uA629\uA6E6-\uA6EF\uA830-\uA835\uA8D0-\uA8D9\uA900-\uA909\uA9D0-\uA9D9\uA9F0-\uA9F9\uAA50-\uAA59\uABF0-\uABF9\uFF10-\uFF19"; +const UNICODE_MARK = + "\u0300-\u036F\u0483-\u0489\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED\u0711\u0730-\u074A\u07A6-\u07B0\u07EB-\u07F3\u0816-\u0819\u081B-\u0823\u0825-\u0827\u0829-\u082D\u0859-\u085B\u08D4-\u08E1\u08E3-\u0903\u093A-\u093C\u093E-\u094F\u0951-\u0957\u0962\u0963\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3\u0A01-\u0A03\u0A3C\u0A3E-\u0A42\u0A47\u0A48\u0A4B-\u0A4D\u0A51\u0A70\u0A71\u0A75\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD\u0AE2\u0AE3\u0B01-\u0B03\u0B3C\u0B3E-\u0B44\u0B47\u0B48\u0B4B-\u0B4D\u0B56\u0B57\u0B62\u0B63\u0B82\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7\u0C00-\u0C03\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55\u0C56\u0C62\u0C63\u0C81-\u0C83\u0CBC\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5\u0CD6\u0CE2\u0CE3\u0D01-\u0D03\u0D3E-\u0D44\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0D62\u0D63\u0D82\u0D83\u0DCA\u0DCF-\u0DD4\u0DD6\u0DD8-\u0DDF\u0DF2\u0DF3\u0E31\u0E34-\u0E3A\u0E47-\u0E4E\u0EB1\u0EB4-\u0EB9\u0EBB\u0EBC\u0EC8-\u0ECD\u0F18\u0F19\u0F35\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86\u0F87\u0F8D-\u0F97\u0F99-\u0FBC\u0FC6\u102B-\u103E\u1056-\u1059\u105E-\u1060\u1062-\u1064\u1067-\u106D\u1071-\u1074\u1082-\u108D\u108F\u109A-\u109D\u135D-\u135F\u1712-\u1714\u1732-\u1734\u1752\u1753\u1772\u1773\u17B4-\u17D3\u17DD\u180B-\u180D\u1885\u1886\u18A9\u1920-\u192B\u1930-\u193B\u1A17-\u1A1B\u1A55-\u1A5E\u1A60-\u1A7C\u1A7F\u1AB0-\u1ABE\u1B00-\u1B04\u1B34-\u1B44\u1B6B-\u1B73\u1B80-\u1B82\u1BA1-\u1BAD\u1BE6-\u1BF3\u1C24-\u1C37\u1CD0-\u1CD2\u1CD4-\u1CE8\u1CED\u1CF2-\u1CF4\u1CF8\u1CF9\u1DC0-\u1DF5\u1DFB-\u1DFF\u20D0-\u20F0\u2CEF-\u2CF1\u2D7F\u2DE0-\u2DFF\u302A-\u302F\u3099\u309A\uA66F-\uA672\uA674-\uA67D\uA69E\uA69F\uA6F0\uA6F1\uA802\uA806\uA80B\uA823-\uA827\uA880\uA881\uA8B4-\uA8C5\uA8E0-\uA8F1\uA926-\uA92D\uA947-\uA953\uA980-\uA983\uA9B3-\uA9C0\uA9E5\uAA29-\uAA36\uAA43\uAA4C\uAA4D\uAA7B-\uAA7D\uAAB0\uAAB2-\uAAB4\uAAB7\uAAB8\uAABE\uAABF\uAAC1\uAAEB-\uAAEF\uAAF5\uAAF6\uABE3-\uABEA\uABEC\uABED\uFB1E\uFE00-\uFE0F\uFE20-\uFE2F"; +const UNICODE_LETTER = + "A-Za-z\xAA\xB5\xBA\xC0-\xD6\xD8-\xF6\xF8-\u02C1\u02C6-\u02D1\u02E0-\u02E4\u02EC\u02EE\u0370-\u0374\u0376\u0377\u037A-\u037D\u037F\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03F5\u03F7-\u0481\u048A-\u052F\u0531-\u0556\u0559\u0561-\u0587\u05D0-\u05EA\u05F0-\u05F2\u0620-\u064A\u066E\u066F\u0671-\u06D3\u06D5\u06E5\u06E6\u06EE\u06EF\u06FA-\u06FC\u06FF\u0710\u0712-\u072F\u074D-\u07A5\u07B1\u07CA-\u07EA\u07F4\u07F5\u07FA\u0800-\u0815\u081A\u0824\u0828\u0840-\u0858\u08A0-\u08B4\u08B6-\u08BD\u0904-\u0939\u093D\u0950\u0958-\u0961\u0971-\u0980\u0985-\u098C\u098F\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09BD\u09CE\u09DC\u09DD\u09DF-\u09E1\u09F0\u09F1\u0A05-\u0A0A\u0A0F\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32\u0A33\u0A35\u0A36\u0A38\u0A39\u0A59-\u0A5C\u0A5E\u0A72-\u0A74\u0A85-\u0A8D\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2\u0AB3\u0AB5-\u0AB9\u0ABD\u0AD0\u0AE0\u0AE1\u0AF9\u0B05-\u0B0C\u0B0F\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32\u0B33\u0B35-\u0B39\u0B3D\u0B5C\u0B5D\u0B5F-\u0B61\u0B71\u0B83\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99\u0B9A\u0B9C\u0B9E\u0B9F\u0BA3\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB9\u0BD0\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C39\u0C3D\u0C58-\u0C5A\u0C60\u0C61\u0C80\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBD\u0CDE\u0CE0\u0CE1\u0CF1\u0CF2\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D3A\u0D3D\u0D4E\u0D54-\u0D56\u0D5F-\u0D61\u0D7A-\u0D7F\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6\u0E01-\u0E30\u0E32\u0E33\u0E40-\u0E46\u0E81\u0E82\u0E84\u0E87\u0E88\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F\u0EA1-\u0EA3\u0EA5\u0EA7\u0EAA\u0EAB\u0EAD-\u0EB0\u0EB2\u0EB3\u0EBD\u0EC0-\u0EC4\u0EC6\u0EDC-\u0EDF\u0F00\u0F40-\u0F47\u0F49-\u0F6C\u0F88-\u0F8C\u1000-\u102A\u103F\u1050-\u1055\u105A-\u105D\u1061\u1065\u1066\u106E-\u1070\u1075-\u1081\u108E\u10A0-\u10C5\u10C7\u10CD\u10D0-\u10FA\u10FC-\u1248\u124A-\u124D\u1250-\u1256\u1258\u125A-\u125D\u1260-\u1288\u128A-\u128D\u1290-\u12B0\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2-\u12C5\u12C8-\u12D6\u12D8-\u1310\u1312-\u1315\u1318-\u135A\u1380-\u138F\u13A0-\u13F5\u13F8-\u13FD\u1401-\u166C\u166F-\u167F\u1681-\u169A\u16A0-\u16EA\u16F1-\u16F8\u1700-\u170C\u170E-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176C\u176E-\u1770\u1780-\u17B3\u17D7\u17DC\u1820-\u1877\u1880-\u1884\u1887-\u18A8\u18AA\u18B0-\u18F5\u1900-\u191E\u1950-\u196D\u1970-\u1974\u1980-\u19AB\u19B0-\u19C9\u1A00-\u1A16\u1A20-\u1A54\u1AA7\u1B05-\u1B33\u1B45-\u1B4B\u1B83-\u1BA0\u1BAE\u1BAF\u1BBA-\u1BE5\u1C00-\u1C23\u1C4D-\u1C4F\u1C5A-\u1C7D\u1C80-\u1C88\u1CE9-\u1CEC\u1CEE-\u1CF1\u1CF5\u1CF6\u1D00-\u1DBF\u1E00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2071\u207F\u2090-\u209C\u2102\u2107\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u212F-\u2139\u213C-\u213F\u2145-\u2149\u214E\u2183\u2184\u2C00-\u2C2E\u2C30-\u2C5E\u2C60-\u2CE4\u2CEB-\u2CEE\u2CF2\u2CF3\u2D00-\u2D25\u2D27\u2D2D\u2D30-\u2D67\u2D6F\u2D80-\u2D96\u2DA0-\u2DA6\u2DA8-\u2DAE\u2DB0-\u2DB6\u2DB8-\u2DBE\u2DC0-\u2DC6\u2DC8-\u2DCE\u2DD0-\u2DD6\u2DD8-\u2DDE\u2E2F\u3005\u3006\u3031-\u3035\u303B\u303C\u3041-\u3096\u309D-\u309F\u30A1-\u30FA\u30FC-\u30FF\u3105-\u312D\u3131-\u318E\u31A0-\u31BA\u31F0-\u31FF\u3400-\u4DB5\u4E00-\u9FD5\uA000-\uA48C\uA4D0-\uA4FD\uA500-\uA60C\uA610-\uA61F\uA62A\uA62B\uA640-\uA66E\uA67F-\uA69D\uA6A0-\uA6E5\uA717-\uA71F\uA722-\uA788\uA78B-\uA7AE\uA7B0-\uA7B7\uA7F7-\uA801\uA803-\uA805\uA807-\uA80A\uA80C-\uA822\uA840-\uA873\uA882-\uA8B3\uA8F2-\uA8F7\uA8FB\uA8FD\uA90A-\uA925\uA930-\uA946\uA960-\uA97C\uA984-\uA9B2\uA9CF\uA9E0-\uA9E4\uA9E6-\uA9EF\uA9FA-\uA9FE\uAA00-\uAA28\uAA40-\uAA42\uAA44-\uAA4B\uAA60-\uAA76\uAA7A\uAA7E-\uAAAF\uAAB1\uAAB5\uAAB6\uAAB9-\uAABD\uAAC0\uAAC2\uAADB-\uAADD\uAAE0-\uAAEA\uAAF2-\uAAF4\uAB01-\uAB06\uAB09-\uAB0E\uAB11-\uAB16\uAB20-\uAB26\uAB28-\uAB2E\uAB30-\uAB5A\uAB5C-\uAB65\uAB70-\uABE2\uAC00-\uD7A3\uD7B0-\uD7C6\uD7CB-\uD7FB\uF900-\uFA6D\uFA70-\uFAD9\uFB00-\uFB06\uFB13-\uFB17\uFB1D\uFB1F-\uFB28\uFB2A-\uFB36\uFB38-\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46-\uFBB1\uFBD3-\uFD3D\uFD50-\uFD8F\uFD92-\uFDC7\uFDF0-\uFDFB\uFE70-\uFE74\uFE76-\uFEFC\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC"; + +const REGEXP_SPLITS = new RegExp( + `[${UNICODE_SPACE}${UNICODE_SYMBOL}${UNICODE_PUNCT}]+` +); +// Match all token characters, so okay for regex to split multiple code points +// eslint-disable-next-line no-misleading-character-class +const REGEXP_ALPHANUMS = new RegExp( + `^[${UNICODE_NUMBER}${UNICODE_MARK}${UNICODE_LETTER}]+$` +); + +/** + * Downcases the text, and splits it into consecutive alphanumeric characters. + * This is locale aware, and so will not strip accents. This uses "word + * breaks", and os is not appropriate for languages without them + * (e.g. Chinese). + */ +function tokenize(text) { + return text + .toLocaleLowerCase() + .split(REGEXP_SPLITS) + .filter(tok => tok.match(REGEXP_ALPHANUMS)); +} + +/** + * Converts a sequence of tokens into an L2 normed TF-IDF. Any terms that are + * not preindexed (i.e. does have a computed inverse document frequency) will + * be dropped. + */ +function toksToTfIdfVector(tokens, vocab_idfs) { + let tfidfs = {}; + + // calcualte the term frequencies + for (let tok of tokens) { + if (!(tok in vocab_idfs)) { + continue; + } + if (!(tok in tfidfs)) { + tfidfs[tok] = [vocab_idfs[tok][0], 1]; + } else { + tfidfs[tok][1]++; + } + } + + // now multiply by the log inverse document frequencies, then take + // the L2 norm of this. + let l2Norm = 0.0; + Object.keys(tfidfs).forEach(tok => { + tfidfs[tok][1] *= vocab_idfs[tok][1]; + l2Norm += tfidfs[tok][1] * tfidfs[tok][1]; + }); + l2Norm = Math.sqrt(l2Norm); + Object.keys(tfidfs).forEach(tok => { + tfidfs[tok][1] /= l2Norm; + }); + + return tfidfs; +} |