Adding upstream version 124.0.1.upstream/124.0.1

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
commit: 26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree: f435a8308119effd964b339f76abb83a57c29483 /toolkit/components/translation/LanguageDetector.sys.mjs
parent: Initial commit. (diff)
download: firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz
firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
1 files changed, 192 insertions, 0 deletions
diff --git a/toolkit/components/translation/LanguageDetector.sys.mjs b/toolkit/components/translation/LanguageDetector.sys.mjs
new file mode 100644
index 0000000000..1bc29fda2e
--- /dev/null
+++ b/toolkit/components/translation/LanguageDetector.sys.mjs
@@ -0,0 +1,192 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// workerManager is exported for tests.
+import { clearTimeout, setTimeout } from "resource://gre/modules/Timer.sys.mjs";
+
+const WORKER_URL = "resource://gre/modules/translation/cld-worker.js"; // Script run by the detection Worker.
+
+/**
+ * The length of the substring to pull from the document's text for language
+ * identification.
+ *
+ * This value should ideally be one that is large enough to yield a confident
+ * identification result without being too large or expensive to extract.
+ *
+ * At this time, this value is not driven by statistical data or analysis.
+ *
+ * For the moment, while we investigate which language identification library
+ * we would like to use, keep this logic in sync with language-id-engine.sys.mjs.
+ */
+const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;
+
+export var workerManager = {
+  // Since Emscripten can handle heap growth, but not heap shrinkage, we
+  // need to refresh the worker after we've processed a particularly large
+  // string in order to prevent unnecessary resident memory growth.
+  //
+  // These values define the cut-off string length and the idle timeout
+  // (in milliseconds) before destroying a worker. Once a string of the
+  // maximum size has been processed, the worker is marked for
+  // destruction, and is terminated as soon as it has been idle for the
+  // given timeout.
+  //
+  // 1.5MB. This is the approximate string length that forces heap growth
+  // for a 2MB heap.
+  LARGE_STRING: 1.5 * 1024 * 1024,
+  IDLE_TIMEOUT: 10 * 1000, // 10 seconds.
+
+  detectionQueue: [], // Pending { resolve } entries, oldest first.
+
+  detectLanguage(aParams) {
+    return this.workerReady
+      .then(worker => {
+        return new Promise(resolve => {
+          this.detectionQueue.push({ resolve });
+          worker.postMessage(aParams);
+        });
+      })
+      .then(result => {
+        // We have our asynchronous result from the worker.
+        //
+        // Determine if our input was large enough to trigger heap growth,
+        // or if we're already waiting to destroy the worker when it's
+        // idle. If so, schedule termination after the idle timeout.
+        if (
+          aParams.text.length >= this.LARGE_STRING ||
+          this._idleTimeout != null
+        ) {
+          this.flushWorker();
+        }
+
+        return result;
+      });
+  },
+
+  _worker: null, // Current Worker instance, or null when there is none.
+  _workerReadyPromise: null, // Resolves to the worker once it posts "ready".
+
+  get workerReady() {
+    if (!this._workerReadyPromise) {
+      this._workerReadyPromise = new Promise(resolve => {
+        let worker = new Worker(WORKER_URL);
+        worker.onmessage = aMsg => {
+          if (aMsg.data == "ready") {
+            resolve(worker);
+          } else {
+            this.detectionQueue.shift().resolve(aMsg.data); // NOTE(review): assumes in-order replies.
+          }
+        };
+        this._worker = worker;
+      });
+    }
+
+    return this._workerReadyPromise;
+  },
+
+  // Holds the ID of the current pending idle cleanup setTimeout.
+  _idleTimeout: null,
+
+  // Schedule the current worker to be terminated after the idle timeout.
+  flushWorker() {
+    if (this._idleTimeout != null) {
+      clearTimeout(this._idleTimeout);
+    }
+
+    this._idleTimeout = setTimeout(
+      this._flushWorker.bind(this),
+      this.IDLE_TIMEOUT
+    );
+  },
+
+  // Immediately terminate the worker, as long as there are no pending
+  // results. Otherwise, reschedule termination until after the next
+  // idle timeout.
+  _flushWorker() {
+    if (this.detectionQueue.length) {
+      this.flushWorker();
+    } else {
+      if (this._worker) {
+        this._worker.terminate();
+      }
+
+      this._worker = null;
+      this._workerReadyPromise = null;
+      this._idleTimeout = null;
+    }
+  },
+};
+
+export var LanguageDetector = {
+  /**
+   * Detect the language of a given string.
+   *
+   * The argument may be either a string containing the text to analyze,
+   * or an object with the following properties:
+   *
+   *  - 'text' The text to analyze.
+   *
+   *  - 'isHTML' (optional) A boolean, indicating whether the text
+   *      should be analyzed as HTML rather than plain text.
+   *
+   *  - 'language' (optional) A string indicating the expected language.
+   *      For text extracted from HTTP documents, this is expected to
+   *      come from the Content-Language header.
+   *
+   *  - 'tld' (optional) A string indicating the top-level domain of the
+   *      document the text was extracted from.
+   *
+   *  - 'encoding' (optional) A string describing the encoding of the
+   *      document the string was extracted from. Note that, regardless
+   *      of the value of this property, the 'text' property must be a
+   *      UTF-16 JavaScript string.
+   *
+   * @returns {Promise<Object>}
+   * @resolves When detection is finished, with an object containing
+   * these fields:
+   *  - 'language' (string with a language code)
+   *  - 'confident' (boolean) Whether the detector is confident of the
+   *      result.
+   *  - 'languages' (array) An array of up to three elements, containing
+   *      the most prevalent languages detected. It contains a
+   *      'languageCode' property, containing the ISO language code of
+   *      the language, and a 'percent' property, describing the
+   *      approximate percentage of the input which is in that language.
+   *      For text of an unknown language, the result may contain an
+   *      entry with the language code 'un', indicating the percent of
+   *      the text which is unknown.
+   */
+  detectLanguage(aParams) {
+    if (typeof aParams == "string") {
+      aParams = { text: aParams };
+    }
+
+    return workerManager.detectLanguage(aParams);
+  },
+
+  /**
+   * Attempts to determine the language in which the document's content is written.
+   *
+   * For the moment, while we investigate which language identification library
+   * we would like to use, keep this logic in sync with language-id-engine.sys.mjs
+   * @returns {Promise<string | null>} The language code, or null if not confident.
+   */
+  async detectLanguageFromDocument(aDocument) {
+    // Grab a selection of text.
+    let encoder = Cu.createDocumentEncoder("text/plain");
+    encoder.init(aDocument, "text/plain", encoder.SkipInvisibleContent);
+    let text = encoder
+      .encodeToStringWithMaxLength(DOC_TEXT_TO_IDENTIFY_LENGTH)
+      .replaceAll("\r", "")
+      .replaceAll("\n", " ");
+
+    const { language, confident } = await workerManager.detectLanguage({
+      text,
+    });
+
+    workerManager.flushWorker();
+
+    return confident ? language : null;
+  },
+};
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-19 00:47:55 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-19 00:47:55 +0000
commit	26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree	f435a8308119effd964b339f76abb83a57c29483 /toolkit/components/translation/LanguageDetector.sys.mjs
parent	Initial commit. (diff)
download	firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz firefox-26a029d407be480d791972afb5975cf62c9360a6.zip