1 files changed, 327 insertions, 0 deletions
diff --git a/toolkit/components/translations/content/language-id-engine-worker.js b/toolkit/components/translations/content/language-id-engine-worker.js
new file mode 100644
index 0000000000..1323b505d2
--- /dev/null
+++ b/toolkit/components/translations/content/language-id-engine-worker.js
@@ -0,0 +1,327 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/* eslint-env mozilla/chrome-worker */
+"use strict";
+
+// Re-throw the reason of every unhandled Promise rejection so that it is visible in the console.
+self.addEventListener("unhandledrejection", ({ reason }) => {
+  throw reason;
+});
+
+/* global addOnPostRun FastText loadFastText */
+importScripts(
+  "chrome://global/content/translations/fasttext.js",
+  "chrome://global/content/translations/fasttext_wasm.js"
+);
+
+/**
+ * The number of languages that should be returned when the model analyzes text.
+ * Passed to the model's predict() call in LanguageIdEngine.identifyLanguage, which
+ * only reads the first entry of the result.
+ * A value of 1 means only the most-likely language will be returned; 5 = top 5.
+ */
+const LANGUAGE_COUNT = 1;
+
+/**
+ * The threshold of likelihood in range [0.0, 1.0] that must pass
+ * for a language to be returned from the model.
+ *
+ * A value of 0.0 would mean that a language is always returned with any confidence.
+ * A value of 0.5 would mean that a language is only returned if the model is 50%
+ * confident; LanguageIdEngine.identifyLanguage throws when nothing is returned.
+ */
+const CONFIDENCE_THRESHOLD = 0.0;
+
+// Off by default; the "initialize" message turns it on per "browser.translations.logLevel".
+let _isLoggingEnabled = false;
+function log(...args) {
+  if (_isLoggingEnabled) {
+    console.log("Translations:", ...args);
+  }
+}
+
+// Wait for the "initialize" request; the handler unregisters itself after attempting initialization.
+addEventListener("message", handleInitializationMessage);
+
+/**
+ * Initialize the engine, and get it ready to handle language identification requests.
+ * The "initialize" message must be received before any other message; anything else
+ * throws. Posts "initialization-success" or "initialization-error" back when done.
+ *
+ * @param {Object} event
+ * @param {Object} event.data
+ * @param {string} event.data.type - The message type, expects "initialize".
+ * @param {ArrayBuffer} event.data.wasmBuffer - The buffer containing the wasm binary.
+ * @param {ArrayBuffer} event.data.modelBuffer - The buffer containing the language-id model binary.
+ * @param {null | string} [event.data.mockedLangTag] - The mocked language tag value (only present when mocking).
+ * @param {null | number} [event.data.mockedConfidence] - The mocked confidence value (only present when mocking).
+ * @param {boolean} event.data.isLoggingEnabled - Whether to enable console logging.
+ */
+async function handleInitializationMessage({ data }) {
+  if (data.type !== "initialize") {
+    throw new Error(
+      "The LanguageIdEngine worker received a message before it was initialized."
+    );
+  }
+
+  try {
+    const { isLoggingEnabled } = data;
+    if (isLoggingEnabled) {
+      // Respect the "browser.translations.logLevel" preference.
+      _isLoggingEnabled = true;
+    }
+
+    /** @type {LanguageIdEngine | MockedLanguageIdEngine} */
+    let languageIdEngine;
+    const { mockedLangTag, mockedConfidence } = data;
+    // Mocked only when both values are given; "!= null" also treats absent fields as unmocked.
+    if (mockedLangTag != null && mockedConfidence != null) {
+      languageIdEngine = new MockedLanguageIdEngine(
+        mockedLangTag,
+        mockedConfidence
+      );
+    } else {
+      languageIdEngine = await initializeLanguageIdEngine(data);
+    }
+
+    handleMessages(languageIdEngine);
+    postMessage({ type: "initialization-success" });
+  } catch (error) {
+    console.error(error);
+    postMessage({ type: "initialization-error", error: error?.message });
+  }
+
+  removeEventListener("message", handleInitializationMessage);
+}
+
+/** Initializes the fastText wasm runtime and loads the fastText model.
+ * @param {ArrayBuffer} modelBuffer - The buffer containing the language-id model binary.
+ * @param {ArrayBuffer} wasmBuffer - The buffer containing the wasm binary.
+ * @returns {Promise<FastTextModel>} Rejects if the wasm module aborts or the model load throws.
+ */
+function initializeFastTextModel(modelBuffer, wasmBuffer) {
+  return new Promise((resolve, reject) => {
+    const initialModule = {
+      onAbort() {
+        reject(new Error("Error loading the fastText Wasm Module"));
+      },
+      onRuntimeInitialized() {
+        addOnPostRun(() => {
+          try {
+            resolve(new FastText(initialModule).loadModelBinary(modelBuffer));
+          } catch (error) {
+            reject(error);
+          }
+        });
+      },
+      wasmBinary: wasmBuffer,
+    };
+    loadFastText(initialModule);
+  });
+}
+
+/**
+ * Builds a LanguageIdEngine from the initialization payload: validates that both
+ * buffers are present, loads the fastText wasm runtime and model, and wraps the model.
+ *
+ * @param {Object} data - Must carry a truthy "modelBuffer" (language-id model binary)
+ *   and a truthy "wasmBuffer" (wasm binary); an Error names the first one missing.
+ * @returns {Promise<LanguageIdEngine>}
+ */
+async function initializeLanguageIdEngine(data) {
+  const { modelBuffer: modelBinary, wasmBuffer: wasmBinary } = data;
+  if (!modelBinary) {
+    throw new Error('LanguageIdEngine initialization missing "modelBuffer"');
+  }
+  if (!wasmBinary) {
+    throw new Error('LanguageIdEngine initialization missing "wasmBuffer"');
+  }
+  const fastTextModel = await initializeFastTextModel(modelBinary, wasmBinary);
+  return new LanguageIdEngine(fastTextModel);
+}
+
+/**
+ * Sets up the message handling for the worker by registering the "message" listener
+ * that serves every message arriving after the initialization message.
+ *
+ * @param {LanguageIdEngine | MockedLanguageIdEngine} languageIdEngine
+ */
+function handleMessages(languageIdEngine) {
+  /**
+   * Answers one "language-id-request" by posting either a "language-id-response"
+   * or, when identifying the language fails, a "language-id-error".
+   *
+   * @param {string} message - The message text to identify the language of.
+   * @param {number} messageId - The ID of the message, echoed back in the reply.
+   */
+  const answerLanguageIdRequest = (message, messageId) => {
+    try {
+      const [confidence, langTag] = languageIdEngine.identifyLanguage(message);
+      postMessage({
+        type: "language-id-response",
+        langTag,
+        confidence,
+        messageId,
+      });
+    } catch (error) {
+      // Log the failure here, and let the requester know which message failed.
+      console.error(error);
+      postMessage({ type: "language-id-error", messageId });
+    }
+  };
+
+  // Handle any message after the initialization message. Anything unexpected,
+  // including a repeated "initialize" message, is only reported to the console.
+  addEventListener("message", ({ data }) => {
+    try {
+      const { type } = data;
+      if (type === "initialize") {
+        throw new Error(
+          "The language-identification engine must not be re-initialized."
+        );
+      }
+      if (type === "language-id-request") {
+        answerLanguageIdRequest(data.message, data.messageId);
+      } else {
+        // No other message types are supported.
+        console.warn("Unknown message type:", type);
+      }
+    } catch (error) {
+      // Ensure the unexpected errors are surfaced in the console.
+      console.error(error);
+    }
+  });
+}
+
+/**
+ * Wraps a machine-learning model that identifies the human language a text is written
+ * in. The engine invokes the model and converts the label it reports into the language
+ * tag format that the Firefox translations code expects, alongside the model's
+ * confidence in that label.
+ */
+class LanguageIdEngine {
+  /**
+   * Labels reported by the model that have to be replaced before they are handed to
+   * Firefox, keyed by the label with its "__label__" prefix already removed.
+   *
+   * fastText is capable of returning any of a predetermined set of 176 langTags:
+   * https://fasttext.cc/docs/en/language-identification.html
+   *
+   * These tags come from ISO639-3:
+   * https://iso639-3.sil.org/code_tables/deprecated_codes/data
+   *
+   * Each of these tags has been cross checked for compatibility with the IANA language
+   * subtag registry, which is used by BCP 47, and only the edge cases are listed here.
+   * https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
+   *
+   * @type {Map<string, string>}
+   */
+  static #replacementTags = new Map([
+    // fastText may return "eml" which is a deprecated ISO639-3 language tag for the language
+    // Emiliano-Romagnolo. It was split into two separate tags "egl" and "rgn":
+    // https://iso639-3.sil.org/request/2008-040
+    //
+    // "eml" was once requested to be added to the IANA registry, but it was denied:
+    // https://www.alvestrand.no/pipermail/ietf-languages/2009-December/009754.html
+    //
+    // Since the model cannot tell "egl" and "rgn" apart when it reports the deprecated tag,
+    // "egl" is used because it is alphabetically first. Should Firefox Translations ever
+    // support either language, a way to distinguish the two should be considered then.
+    ["eml", "egl"],
+    // The fastText model returns "no" for Norwegian Bokmål.
+    //
+    // According to advice from https://r12a.github.io/app-subtags/
+    // "no" is a macro language that encompasses the more specific primary language
+    // subtags "nb" and "nn". More specific subtags are recommended as long as they do
+    // not break legacy usage of an application, so "nb" is reported instead of "no".
+    ["no", "nb"],
+  ]);
+
+  /** @type {FastTextModel} */
+  #model;
+
+  /**
+   * @param {FastTextModel} model - The loaded model; identifyLanguage calls its predict().
+   */
+  constructor(model) {
+    this.#model = model;
+  }
+
+  /**
+   * Converts a label returned by the language-identification model into the format
+   * used internally by Firefox.
+   *
+   * This is tailored to the fastText language-identification model, which returns values
+   * of the format "__label__{langTag}". Updating the model or moving to something other
+   * than fastText in the future will likely require updating this function.
+   *
+   * @param {string} langTag - The raw label, e.g. "__label__en".
+   * @returns {string} The label without its prefix, replaced if it is an edge case.
+   */
+  #formatLangTag(langTag) {
+    const strippedTag = langTag.replace("__label__", "");
+    return LanguageIdEngine.#replacementTags.get(strippedTag) ?? strippedTag;
+  }
+
+  /**
+   * Identifies the human language in which the message is written and returns the
+   * BCP 47 language tag of that language along with a rating of how confident the
+   * model is that the label is correct. The message is trimmed before it is analyzed.
+   *
+   * @param {string} message
+   * @returns {Array<number | string>} An array containing the confidence and language tag.
+   * The confidence is a number between 0 and 1, representing a percentage.
+   * The language tag is a BCP 47 language tag such as "en" for English.
+   *
+   * e.g. [0.87, "en"]
+   * @throws {Error} When the model returns no language for the message.
+   */
+  identifyLanguage(message) {
+    const predictions = this.#model.predict(
+      message.trim(),
+      LANGUAGE_COUNT,
+      CONFIDENCE_THRESHOLD
+    );
+    const topPrediction = predictions.get(0);
+
+    // This should never fail as long as
+    // LANGUAGE_COUNT >= 1 && CONFIDENCE_THRESHOLD === 0.0
+    if (!topPrediction) {
+      throw new Error("Unable to identify a language");
+    }
+
+    const [confidence, rawLangTag] = topPrediction;
+    return [confidence, this.#formatLangTag(rawLangTag)];
+  }
+}
+
+/**
+ * A fully mocked engine for testing purposes. It allows integration testing of the UI
+ * without downloading remote models or remote wasm binaries, since it never runs a
+ * model and simply reports the values it was constructed with.
+ */
+class MockedLanguageIdEngine {
+  /** @type {number} */
+  #mockedConfidence;
+  /** @type {string} */
+  #mockedLangTag;
+
+  /**
+   * @param {string} langTag - The language tag that every identification reports.
+   * @param {number} confidence - The confidence that every identification reports.
+   */
+  constructor(langTag, confidence) {
+    this.#mockedConfidence = confidence;
+    this.#mockedLangTag = langTag;
+  }
+
+  /**
+   * Mocks identifying a language: the message is ignored and the pre-determined
+   * values are returned in the same [confidence, langTag] shape as the real engine.
+   */
+  identifyLanguage(_message) {
+    return [this.#mockedConfidence, this.#mockedLangTag];
+  }
+}