/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */

import { clearTimeout, setTimeout } from "resource://gre/modules/Timer.sys.mjs";

const WORKER_URL = "resource://gre/modules/translations/cld-worker.js";

/**
 * The options used for when detecting a language.
 *
 * @typedef {object} DetectionOptions
 *
 * @property {string} text - The text to analyze.
 * @property {boolean} [isHTML] - A boolean, indicating whether the text should be analyzed as
 *   HTML rather than plain text.
 * @property {string} [language] - A string indicating the expected language. For text
 *   extracted from HTTP documents, this is expected to come from the Content-Language
 *   header.
 * @property {string} [tld] - A string indicating the top-level domain of the document the
 *   text was extracted from.
 * @property {string} [encoding] - A string describing the encoding of the document the
 *   string was extracted from. Note that, regardless of the value of this property,
 *   the 'text' property must be a UTF-16 JavaScript string.
 */

/**
 * A larger web document can be composed of multiple languages. This object details the
 * breakdown of what languages are present in the document, and at what percentages.
 * For instance a document could be 70% English and 30% French:
 *
 * [
 *   { language: "en", percent: 70 },
 *   { language: "fr", percent: 30 },
 * ]
 *
 * @typedef {object} MultilingualSection
 * @property {string} language - BCP 47 language tag, or "un" for unknown.
 * @property {number} percent - The integral percentage ranged 0-100.
 */

/**
 * @typedef {object} DetectionResult
 * @property {string} language - The language code
 * @property {boolean} confident - Whether the detector is confident of the result.
 * @property {Array<MultilingualSection>} languages - The list of languages detected in
 *   multilingual content. This is between 0 and 3 languages.
 */

/**
 * The length of the substring to pull from the document's text for language
 * identification.
 *
 * This value should ideally be one that is large enough to yield a confident
 * identification result without being too large or expensive to extract.
 *
 * At this time, this value is not driven by statistical data or analysis.
 */
const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024;

/**
 * The shorter the text, the less confidence we should have in the result of the language
 * identification. Add another heuristic to report the ID as not confident if the length
 * of the text, measured in UTF-16 code units, is less than this threshold.
 *
 * This was determined by plotting a kernel density estimation of the number of times the
 * source language had to be changed in the SelectTranslationsPanel vs. the code units in
 * the source text.
 *
 * 0013 code units or less - 49.5% of language changes
 * 0036 code units or less - 74.9% of language changes
 * 0153 code units or less - 90.0% of language changes
 * 0200 code units or less - 91.5% of language changes
 * 0427 code units or less - 95.0% of language changes
 * 1382 code units or less - 98.0% of language changes
 * 3506 code units or less - 99.0% of language changes
 */
const DOC_CONFIDENCE_THRESHOLD = 200;

/**
 * An internal class to manage communicating to the worker, and managing its lifecycle.
 * It's initialized once below statically to the module.
 */
class WorkerManager {
  // Since Emscripten can handle heap growth, but not heap shrinkage, we need to refresh
  // the worker after we've processed a particularly large string in order to prevent
  // unnecessary resident memory growth.
  //
  // These values define the cut-off string length and the idle timeout (in milliseconds)
  // before destroying a worker. Once a string of the maximum size has been processed,
  // the worker is marked for destruction, and is terminated as soon as it has been idle
  // for the given timeout.
  //
  // 1.5MB. This is the approximate string length that forces heap growth for a 2MB heap.
  LARGE_STRING = 1.5 * 1024 * 1024;
  IDLE_TIMEOUT = 10_000;

  /**
   * Resolvers for the detection queue. Responses from the worker are matched to
   * resolvers in first-in, first-out order.
   *
   * @type {Array<(result: DetectionResult) => void>}
   */
  detectionQueue = [];

  /**
   * The current worker, or null when none has been created or it was terminated.
   *
   * @type {Worker | null}
   */
  worker = null;

  /**
   * Resolves with the worker once it has reported that it is "ready".
   *
   * @type {Promise<Worker> | null}
   */
  workerPromise = null;

  /**
   * Holds the ID of the current pending idle cleanup setTimeout.
   *
   * @type {number | null}
   */
  idleTimeoutId = null;

  /**
   * Sends the options to the worker and resolves with the worker's response.
   *
   * @param {DetectionOptions} options
   * @returns {Promise<DetectionResult>}
   */
  async detectLanguage(options) {
    const worker = await this.getWorker();

    const result = await new Promise(resolve => {
      // Post before enqueueing: if postMessage throws (for example because the options
      // cannot be structured-cloned), no resolver is left behind in the queue where it
      // would be paired with the response to a later request. The worker's reply is
      // delivered asynchronously, so it cannot arrive before the resolver is queued.
      worker.postMessage(options);
      this.detectionQueue.push(resolve);
    });

    // We have our asynchronous result from the worker.
    //
    // Determine if our input was large enough to trigger heap growth,
    // or if we're already waiting to destroy the worker when it's
    // idle. If so, schedule termination after the idle timeout.
    if (
      options.text.length >= this.LARGE_STRING ||
      this.idleTimeoutId != null
    ) {
      this.flushWorker();
    }

    return result;
  }

  /**
   * Lazily creates the worker on first use. Later calls return the same promise until
   * the worker is terminated by the idle cleanup.
   *
   * @returns {Promise<Worker>}
   */
  getWorker() {
    if (!this.workerPromise) {
      this.workerPromise = new Promise(resolve => {
        const worker = new Worker(WORKER_URL);

        worker.onmessage = message => {
          if (message.data == "ready") {
            // The first message signals that the worker can accept requests.
            resolve(worker);
          } else {
            // Every other message is a detection result for the oldest queued request.
            /** @type {DetectionResult} */
            const detectionResult = message.data;
            const resolver = this.detectionQueue.shift();
            resolver(detectionResult);
          }
        };

        this.worker = worker;
      });
    }

    return this.workerPromise;
  }

  /**
   * Schedule the current worker to be terminated after the idle timeout. Calling this
   * again before the timeout fires restarts the countdown.
   */
  flushWorker() {
    if (this.idleTimeoutId != null) {
      clearTimeout(this.idleTimeoutId);
    }

    this.idleTimeoutId = setTimeout(() => {
      if (this.detectionQueue.length) {
        // Reschedule the termination as something else was added to the queue.
        this.flushWorker();
      } else {
        // Terminate the worker, and reset the state so that the next request
        // creates a fresh one.
        this.worker?.terminate();
        this.worker = null;
        this.workerPromise = null;
        this.idleTimeoutId = null;
      }
    }, this.IDLE_TIMEOUT);
  }
}

/**
 * The worker manager is static to this module. Exported for unit testing.
 */
export const workerManager = new WorkerManager();

/**
 * Static entry points for language detection. All requests are forwarded to the
 * module-level worker manager.
 */
export class LanguageDetector {
  /**
   * Detect the language of a given string.
   *
   * @param {DetectionOptions | string} options - Either the text to analyze,
   *   or the options.
   * @returns {Promise<DetectionResult>}
   */
  static detectLanguage(options) {
    // A bare string is shorthand for options that only contain the text.
    const detectionOptions =
      typeof options == "string" ? { text: options } : options;

    return workerManager.detectLanguage(detectionOptions);
  }

  /**
   * Attempts to determine the language in which the document's content is written.
   *
   * @param {Document} document
   * @returns {Promise<DetectionResult>}
   */
  static async detectLanguageFromDocument(document) {
    // Grab a selection of text.
    const encoder = Cu.createDocumentEncoder("text/plain");
    encoder.init(document, "text/plain", encoder.SkipInvisibleContent);

    // Drop carriage returns and turn newlines into spaces before detection.
    const text = encoder
      .encodeToStringWithMaxLength(DOC_TEXT_TO_IDENTIFY_LENGTH)
      .replaceAll("\r", "")
      .replaceAll("\n", " ");

    const result = await workerManager.detectLanguage({
      text,
    });

    // Short samples are not trusted, regardless of what the detector reported.
    if (text.length < DOC_CONFIDENCE_THRESHOLD) {
      result.confident = false;
    }

    return result;
  }
}