summaryrefslogtreecommitdiffstats
path: root/toolkit/components/translations/content/language-id-engine-worker.js
diff options
context:
space:
mode:
Diffstat (limited to 'toolkit/components/translations/content/language-id-engine-worker.js')
-rw-r--r--toolkit/components/translations/content/language-id-engine-worker.js327
1 files changed, 327 insertions, 0 deletions
diff --git a/toolkit/components/translations/content/language-id-engine-worker.js b/toolkit/components/translations/content/language-id-engine-worker.js
new file mode 100644
index 0000000000..1323b505d2
--- /dev/null
+++ b/toolkit/components/translations/content/language-id-engine-worker.js
@@ -0,0 +1,327 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* eslint-env mozilla/chrome-worker */
+"use strict";
+
+// Throw Promise rejection errors so that they are visible in the console.
+self.addEventListener("unhandledrejection", event => {
+ throw event.reason;
+});
+
+/* global addOnPostRun FastText loadFastText */
+importScripts(
+ "chrome://global/content/translations/fasttext.js",
+ "chrome://global/content/translations/fasttext_wasm.js"
+);
+
+/**
+ * The number of languages that should be returned when the model analyzes text.
+ *
+ * A value of 1 means only the most-likely language will be returned.
+ * A value of 5 would mean that the top 5 most-likely languages will be returned.
+ */
+const LANGUAGE_COUNT = 1;
+
+/**
+ * The threshold of likelihood in range [0.0, 1.0] that must pass
+ * for a language to be returned from the model.
+ *
+ * A value of 0.0 would mean that a language is always returned with any confidence.
+ * A value of 0.5 would mean that a language is only returned if the model
+ * is 50% confident that the analyzed text could be that language.
+ */
+const CONFIDENCE_THRESHOLD = 0.0;
+
+// Respect the preference "browser.translations.logLevel".
+let _isLoggingEnabled = true;
+function log(...args) {
+ if (_isLoggingEnabled) {
+ console.log("Translations:", ...args);
+ }
+}
+
+// Wait for the initialization request.
+addEventListener("message", handleInitializationMessage);
+
+/**
+ * Initialize the engine, and get it ready to handle language identification requests.
+ * The "initialize" message must be received before any other message handling
+ * requests will be processed.
+ *
+ * @param {Object} event
+ * @param {Object} event.data
+ * @param {string} event.data.type - The message type, expects "initialize".
+ * @param {ArrayBuffer} event.data.wasmBuffer - The buffer containing the wasm binary.
+ * @param {ArrayBuffer} event.data.modelBuffer - The buffer containing the language-id model binary.
+ * @param {null | string} event.data.mockedLangTag - The mocked language tag value (only present when mocking).
+ * @param {null | number} event.data.mockedConfidence - The mocked confidence value (only present when mocking).
+ * @param {boolean} event.data.isLoggingEnabled
+ */
+async function handleInitializationMessage({ data }) {
+ if (data.type !== "initialize") {
+ throw new Error(
+ "The LanguageIdEngine worker received a message before it was initialized."
+ );
+ }
+
+ try {
+ const { isLoggingEnabled } = data;
+ if (isLoggingEnabled) {
+ // Respect the "browser.translations.logLevel" preference.
+ _isLoggingEnabled = true;
+ }
+
+ /** @type {LanguageIdEngine | MockedLanguageIdEngine} */
+ let languageIdEngine;
+ const { mockedLangTag, mockedConfidence } = data;
+ if (mockedLangTag !== null && mockedConfidence !== null) {
+ // Don't actually use the engine as it is mocked.
+ languageIdEngine = new MockedLanguageIdEngine(
+ mockedLangTag,
+ mockedConfidence
+ );
+ } else {
+ languageIdEngine = await initializeLanguageIdEngine(data);
+ }
+
+ handleMessages(languageIdEngine);
+ postMessage({ type: "initialization-success" });
+ } catch (error) {
+ console.error(error);
+ postMessage({ type: "initialization-error", error: error?.message });
+ }
+
+ removeEventListener("message", handleInitializationMessage);
+}
+
+/**
+ * Initializes the fastText wasm runtime and returns the fastText model.
+ *
+ * @param {ArrayBuffer} data.wasmBuffer - The buffer containing the wasm binary.
+ * @param {ArrayBuffer} data.modelBuffer - The buffer containing the language-id model binary.
+ * @returns {FastTextModel}
+ */
+function initializeFastTextModel(modelBuffer, wasmBuffer) {
+ return new Promise((resolve, reject) => {
+ const initialModule = {
+ onAbort() {
+ reject(new Error("Error loading the fastText Wasm Module"));
+ },
+ onRuntimeInitialized() {
+ addOnPostRun(() => {
+ const ft = new FastText(initialModule);
+ const model = ft.loadModelBinary(modelBuffer);
+ resolve(model);
+ });
+ },
+ wasmBinary: wasmBuffer,
+ };
+ loadFastText(initialModule);
+ });
+}
+
+/**
+ * Initialize the LanguageIdEngine from the data payload by loading
+ * the fastText wasm runtime and model and constructing the engine.
+ *
+ * @param {Object} data
+ * @property {ArrayBuffer} data.wasmBuffer - The buffer containing the wasm binary.
+ * @property {ArrayBuffer} data.modelBuffer - The buffer containing the language-id model binary.
+ */
+async function initializeLanguageIdEngine(data) {
+ const { modelBuffer, wasmBuffer } = data;
+ if (!modelBuffer) {
+ throw new Error('LanguageIdEngine initialization missing "modelBuffer"');
+ }
+ if (!wasmBuffer) {
+ throw new Error('LanguageIdEngine initialization missing "wasmBuffer"');
+ }
+ const model = await initializeFastTextModel(modelBuffer, wasmBuffer);
+ return new LanguageIdEngine(model);
+}
+
+/**
+ * Sets up the message handling for the worker.
+ *
+ * @param {LanguageIdEngine | MockedLanguageIdEngine} languageIdEngine
+ */
+function handleMessages(languageIdEngine) {
+ /**
+ * Handle any message after the initialization message.
+ *
+ * @param {Object} data
+ * @property {string} data.type - The message type.
+ * @property {string} data.message - The message text to identify the language of.
+ * @property {number} data.messageId - The ID of the message.
+ */
+ addEventListener("message", ({ data }) => {
+ try {
+ if (data.type === "initialize") {
+ throw new Error(
+ "The language-identification engine must not be re-initialized."
+ );
+ }
+ switch (data.type) {
+ case "language-id-request": {
+ const { message, messageId } = data;
+ try {
+ const [confidence, langTag] =
+ languageIdEngine.identifyLanguage(message);
+ postMessage({
+ type: "language-id-response",
+ langTag,
+ confidence,
+ messageId,
+ });
+ } catch (error) {
+ console.error(error);
+ postMessage({
+ type: "language-id-error",
+ messageId,
+ });
+ }
+ break;
+ }
+ default: {
+ console.warn("Unknown message type:", data.type);
+ }
+ }
+ } catch (error) {
+ // Ensure the unexpected errors are surfaced in the console.
+ console.error(error);
+ }
+ });
+}
+
+/**
+ * The LanguageIdEngine wraps around a machine-learning model that can identify text
+ * as being written in a given human language. The engine is responsible for invoking
+ * model and returning the language tag in the format that is expected by firefox
+ * translations code.
+ */
+class LanguageIdEngine {
+ /** @type {FastTextModel} */
+ #model;
+
+ /**
+ * @param {FastTextModel} model
+ */
+ constructor(model) {
+ this.#model = model;
+ }
+
+ /**
+ * Formats the language tag returned by the language-identification model to match
+ * conform to the format used internally by Firefox.
+ *
+ * This function is currently configured to handle the fastText language-identification
+ * model. Updating the language-identification model or moving to something other than
+ * fastText in the future will likely require updating this function.
+ *
+ * @param {string} langTag
+ * @returns {string} The correctly formatted langTag
+ */
+ #formatLangTag(langTag) {
+ // The fastText language model returns values of the format "__label__{langTag}".
+ // As such, this function strips the "__label__" prefix, leaving only the langTag.
+ let formattedTag = langTag.replace("__label__", "");
+
+ // fastText is capable of returning any of a predetermined set of 176 langTags:
+ // https://fasttext.cc/docs/en/language-identification.html
+ //
+ // These tags come from ISO639-3:
+ // https://iso639-3.sil.org/code_tables/deprecated_codes/data
+ //
+ // Each of these tags have been cross checked for compatibility with the IANA
+ // language subtag registry, which is used by BCP 47, and any edge cases are handled below.
+ // https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
+ switch (formattedTag) {
+ // fastText may return "eml" which is a deprecated ISO639-3 language tag for the language
+ // Emiliano-Romagnolo. It was split into two separate tags "egl" and "rgn":
+ // https://iso639-3.sil.org/request/2008-040
+ //
+ // "eml" was once requested to be added to the IANA registry, but it was denied:
+ // https://www.alvestrand.no/pipermail/ietf-languages/2009-December/009754.html
+ //
+ // This case should return either "egl" or "rgn", given that the "eml" tag was split.
+ // However, given that the fastText model does not distinguish between the two by using
+ // the deprecated tag, this function will default to "egl" because it is alphabetically first.
+ //
+ // At such a time that Firefox Translations may support either of these languages, we should consider
+ // a way to further distinguish between the two languages at that time.
+ case "eml": {
+ formattedTag = "egl";
+ break;
+ }
+ // The fastText model returns "no" for Norwegian Bokmål.
+ //
+ // According to advice from https://r12a.github.io/app-subtags/
+ // "no" is a macro language that encompasses the following more specific primary language subtags: "nb" "nn".
+ // It is recommended to use more specific language subtags as long as it does not break legacy usage of an application.
+ // As such, this function will return "nb" for Norwegian Bokmål instead of "no" as reported by fastText.
+ case "no": {
+ formattedTag = "nb";
+ break;
+ }
+ }
+ return formattedTag;
+ }
+
+ /**
+ * Identifies the human language in which the message is written and returns
+ * the BCP 47 language tag of the language it is determined to be along along
+ * with a rating of how confident the model is that the label is correct.
+ *
+ * @param {string} message
+ * @returns {Array<number | string>} An array containing the confidence and language tag.
+ * The confidence is a number between 0 and 1, representing a percentage.
+ * The language tag is a BCP 47 language tag such as "en" for English.
+ *
+ * e.g. [0.87, "en"]
+ */
+ identifyLanguage(message) {
+ const mostLikelyLanguageData = this.#model
+ .predict(message.trim(), LANGUAGE_COUNT, CONFIDENCE_THRESHOLD)
+ .get(0);
+
+ // This should never fail as long as
+ // LANGUAGE_COUNT > 1 && CONFIDENCE_THRESHOLD === 0.0
+ if (!mostLikelyLanguageData) {
+ throw new Error("Unable to identify a language");
+ }
+
+ const [confidence, langTag] = mostLikelyLanguageData;
+ return [confidence, this.#formatLangTag(langTag)];
+ }
+}
+
+/**
+ * For testing purposes, provide a fully mocked engine. This allows for easy integration
+ * testing of the UI, without having to rely on downloading remote models and remote
+ * wasm binaries.
+ */
+class MockedLanguageIdEngine {
+ /** @type {string} */
+ #langTag;
+ /** @type {number} */
+ #confidence;
+
+ /**
+ * @param {string} langTag
+ * @param {number} confidence
+ */
+ constructor(langTag, confidence) {
+ this.#langTag = langTag;
+ this.#confidence = confidence;
+ }
+
+ /**
+ * Mocks identifying a language by returning the mocked engine's pre-determined
+ * language tag and confidence values.
+ */
+ identifyLanguage(_message) {
+ return [this.#confidence, this.#langTag];
+ }
+}