firefox/toolkit/components/translations/LanguageDetector.sys.mjs

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */

import { clearTimeout, setTimeout } from "resource://gre/modules/Timer.sys.mjs";

const WORKER_URL = "resource://gre/modules/translations/cld-worker.js";

/**
 * The options used for when detecting a language.
 *
 * @typedef {object} DetectionOptions
 *
 * @property {string} text - The text to analyze.
 * @property {boolean} [isHTML] - A boolean, indicating whether the text should be analyzed as
 *     HTML rather than plain text.
 * @property {string} [language] - A string indicating the expected language. For text
 *     extracted from HTTP documents, this is expected to come from the Content-Language
 *     header.
 * @property {string} [tld] - A string indicating the top-level domain of the document the
 *     text was extracted from.
 * @property {string} [encoding] - A string describing the encoding of the document the
 *     string was extracted from. Note that, regardless of the value of this property,
 *     the 'text' property must be a UTF-16 JavaScript string.
 */

/**
 * A larger web document can be composed of multiple languages. This object details the
 * breakdown of what languages are present in the document, and at what percentages.
 * For instance a document could be 70% English and 30% French:
 *
 *   [
 *      { language: "en", percentage: 70 },
 *      { language: "fr", percentage: 30 },
 *   ]
 *
 * @typedef {object} MultilingualSection
 * @property {string} language - BCP 47 language tag, or "un" for unknown.
 * @property {number} percent - The integral percentage ranged 0-100.
 */

/**
 * @typedef {object} DetectionResult
 * @property {string} language - The language code
 * @property {boolean} confident - Whether the detector is confident of the result.
 * @property {Array<MultilingualSection>} languages - The list of languages detected in
 *     multilingual content. This is between 0 and 3 languages.
 */

/**
 * The length of the substring to pull from the document's text for language
 * identification.
 *
 * This value should ideally be one that is large enough to yield a confident
 * identification result without being too large or expensive to extract.
 *
 * At this time, this value is not driven by statistical data or analysis.
 */
const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;

/**
 * The shorter the text, the less confidence we should have in the result of the language
 * identification. Add another heuristic to report the ID as not confident if the length
 * of the code points of the text is less than this threshold.
 *
 * This was determined by plotting a kernel density estimation of the number of times the
 * source language had to be changed in the SelectTranslationsPanel vs. the code units in
 * the source text.
 *
 * 0013 code units or less - 49.5% of language changes
 * 0036 code units or less - 74.9% of language changes
 * 0153 code units or less - 90.0% of language changes
 * 0200 code units or less - 91.5% of language changes
 * 0427 code units or less - 95.0% of language changes
 * 1382 code units or less - 98.0% of language changes
 * 3506 code units or less - 99.0% of language changes
 */
const DOC_CONFIDENCE_THRESHOLD = 200;

/**
 * An internal class to manage communicating to the worker, and managing its lifecycle.
 * It's initialized once below statically to the module.
 */
class WorkerManager {
  // Since Emscripten can handle heap growth, but not heap shrinkage, we need to refresh
  // the worker after we've processed a particularly large string in order to prevent
  // unnecessary resident memory growth.
  //
  // These values define the cut-off string length and the idle timeout (in milliseconds)
  // before destroying a worker. Once a string of the maximum size has been processed,
  // the worker is marked for destruction, and is terminated as soon as it has been idle
  // for the given timeout.
  //
  // 1.5MB. This is the approximate string length that forces heap growth for a 2MB heap.
  LARGE_STRING = 1.5 * 1024 * 1024;
  IDLE_TIMEOUT = 10_000;

  /**
   * Resolvers for the detection queue.
   *
   * @type {Array<(result: DetectionResult) => void>}
   */
  detectionQueue = [];

  /**
   * @type {Worker | null}
   */
  worker = null;

  /**
   * @type {Promise<Worker> | null}
   */
  workerPromise = null;

  /**
   * Holds the ID of the current pending idle cleanup setTimeout.
   *
   * @type {number | null}
   */
  idleTimeoutId = null;

  /**
   * @param {DetectionOptions} options
   * @returns {Promise<DetectionResult>}
   */
  async detectLanguage(options) {
    const worker = await this.getWorker();

    const result = await new Promise(resolve => {
      this.detectionQueue.push(resolve);
      worker.postMessage(options);
    });

    // We have our asynchronous result from the worker.
    //
    // Determine if our input was large enough to trigger heap growth,
    // or if we're already waiting to destroy the worker when it's
    // idle. If so, schedule termination after the idle timeout.
    if (
      options.text.length >= this.LARGE_STRING ||
      this.idleTimeoutId != null
    ) {
      this.flushWorker();
    }

    return result;
  }

  /**
   * @returns {Promise<Worker>}
   */
  getWorker() {
    if (!this.workerPromise) {
      this.workerPromise = new Promise(resolve => {
        let worker = new Worker(WORKER_URL);
        worker.onmessage = message => {
          if (message.data == "ready") {
            resolve(worker);
          } else {
            /** @type {DetectionResult} */
            const detectionResult = message.data;

            const resolver = this.detectionQueue.shift();
            resolver(detectionResult);
          }
        };
        this.worker = worker;
      });
    }

    return this.workerPromise;
  }

  /**
   * Schedule the current worker to be terminated after the idle timeout.
   */
  flushWorker() {
    if (this.idleTimeoutId != null) {
      clearTimeout(this.idleTimeoutId);
    }

    this.idleTimeoutId = setTimeout(() => {
      if (this.detectionQueue.length) {
        // Reschedule the termination as something else was added to the queue.
        this.flushWorker();
      } else {
        // Terminate the worker.
        if (this.worker) {
          this.worker.terminate();
        }

        this.worker = null;
        this.workerPromise = null;
        this.idleTimeoutId = null;
      }
    }, this.IDLE_TIMEOUT);
  }
}

/**
 * The worker manager is static to this module. Exported it for unit testing.
 */
export const workerManager = new WorkerManager();

/**
 *
 */
export class LanguageDetector {
  /**
   * Detect the language of a given string.
   *
   * @param {DetectionOptions | string} options - Either the text to analyze,
   *     or the options.
   * @returns {Promise<DetectionResult>}
   */
  static detectLanguage(options) {
    if (typeof options == "string") {
      options = { text: options };
    }

    return workerManager.detectLanguage(options);
  }

  /**
   * Attempts to determine the language in which the document's content is written.
   *
   * @param {Document} document
   * @returns {DetectionResult}
   */
  static async detectLanguageFromDocument(document) {
    // Grab a selection of text.
    let encoder = Cu.createDocumentEncoder("text/plain");
    encoder.init(document, "text/plain", encoder.SkipInvisibleContent);
    let text = encoder
      .encodeToStringWithMaxLength(DOC_TEXT_TO_IDENTIFY_LENGTH)
      .replaceAll("\r", "")
      .replaceAll("\n", " ");

    const result = await workerManager.detectLanguage({
      text,
    });

    if (text.length < DOC_CONFIDENCE_THRESHOLD) {
      result.confident = false;
    }
    return result;
  }
}