diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch) | |
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /toolkit/components/translations/content | |
parent | Initial commit. (diff) | |
download | firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip |
Adding upstream version 115.7.0esr.upstream/115.7.0esrupstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'toolkit/components/translations/content')
7 files changed, 3367 insertions, 0 deletions
diff --git a/toolkit/components/translations/content/language-id-engine-worker.js b/toolkit/components/translations/content/language-id-engine-worker.js new file mode 100644 index 0000000000..1323b505d2 --- /dev/null +++ b/toolkit/components/translations/content/language-id-engine-worker.js @@ -0,0 +1,327 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* eslint-env mozilla/chrome-worker */ +"use strict"; + +// Throw Promise rejection errors so that they are visible in the console. +self.addEventListener("unhandledrejection", event => { + throw event.reason; +}); + +/* global addOnPostRun FastText loadFastText */ +importScripts( + "chrome://global/content/translations/fasttext.js", + "chrome://global/content/translations/fasttext_wasm.js" +); + +/** + * The number of languages that should be returned when the model analyzes text. + * + * A value of 1 means only the most-likely language will be returned. + * A value of 5 would mean that the top 5 most-likely languages will be returned. + */ +const LANGUAGE_COUNT = 1; + +/** + * The threshold of likelihood in range [0.0, 1.0] that must pass + * for a language to be returned from the model. + * + * A value of 0.0 would mean that a language is always returned with any confidence. + * A value of 0.5 would mean that a language is only returned if the model + * is 50% confident that the analyzed text could be that language. + */ +const CONFIDENCE_THRESHOLD = 0.0; + +// Respect the preference "browser.translations.logLevel". +let _isLoggingEnabled = true; +function log(...args) { + if (_isLoggingEnabled) { + console.log("Translations:", ...args); + } +} + +// Wait for the initialization request. +addEventListener("message", handleInitializationMessage); + +/** + * Initialize the engine, and get it ready to handle language identification requests. 
+ * The "initialize" message must be received before any other message handling + * requests will be processed. + * + * @param {Object} event + * @param {Object} event.data + * @param {string} event.data.type - The message type, expects "initialize". + * @param {ArrayBuffer} event.data.wasmBuffer - The buffer containing the wasm binary. + * @param {ArrayBuffer} event.data.modelBuffer - The buffer containing the language-id model binary. + * @param {null | string} event.data.mockedLangTag - The mocked language tag value (only present when mocking). + * @param {null | number} event.data.mockedConfidence - The mocked confidence value (only present when mocking). + * @param {boolean} event.data.isLoggingEnabled + */ +async function handleInitializationMessage({ data }) { + if (data.type !== "initialize") { + throw new Error( + "The LanguageIdEngine worker received a message before it was initialized." + ); + } + + try { + const { isLoggingEnabled } = data; + if (isLoggingEnabled) { + // Respect the "browser.translations.logLevel" preference. + _isLoggingEnabled = true; + } + + /** @type {LanguageIdEngine | MockedLanguageIdEngine} */ + let languageIdEngine; + const { mockedLangTag, mockedConfidence } = data; + if (mockedLangTag !== null && mockedConfidence !== null) { + // Don't actually use the engine as it is mocked. + languageIdEngine = new MockedLanguageIdEngine( + mockedLangTag, + mockedConfidence + ); + } else { + languageIdEngine = await initializeLanguageIdEngine(data); + } + + handleMessages(languageIdEngine); + postMessage({ type: "initialization-success" }); + } catch (error) { + console.error(error); + postMessage({ type: "initialization-error", error: error?.message }); + } + + removeEventListener("message", handleInitializationMessage); +} + +/** + * Initializes the fastText wasm runtime and returns the fastText model. + * + * @param {ArrayBuffer} data.wasmBuffer - The buffer containing the wasm binary. 
+ * @param {ArrayBuffer} data.modelBuffer - The buffer containing the language-id model binary. + * @returns {FastTextModel} + */ +function initializeFastTextModel(modelBuffer, wasmBuffer) { + return new Promise((resolve, reject) => { + const initialModule = { + onAbort() { + reject(new Error("Error loading the fastText Wasm Module")); + }, + onRuntimeInitialized() { + addOnPostRun(() => { + const ft = new FastText(initialModule); + const model = ft.loadModelBinary(modelBuffer); + resolve(model); + }); + }, + wasmBinary: wasmBuffer, + }; + loadFastText(initialModule); + }); +} + +/** + * Initialize the LanguageIdEngine from the data payload by loading + * the fastText wasm runtime and model and constructing the engine. + * + * @param {Object} data + * @property {ArrayBuffer} data.wasmBuffer - The buffer containing the wasm binary. + * @property {ArrayBuffer} data.modelBuffer - The buffer containing the language-id model binary. + */ +async function initializeLanguageIdEngine(data) { + const { modelBuffer, wasmBuffer } = data; + if (!modelBuffer) { + throw new Error('LanguageIdEngine initialization missing "modelBuffer"'); + } + if (!wasmBuffer) { + throw new Error('LanguageIdEngine initialization missing "wasmBuffer"'); + } + const model = await initializeFastTextModel(modelBuffer, wasmBuffer); + return new LanguageIdEngine(model); +} + +/** + * Sets up the message handling for the worker. + * + * @param {LanguageIdEngine | MockedLanguageIdEngine} languageIdEngine + */ +function handleMessages(languageIdEngine) { + /** + * Handle any message after the initialization message. + * + * @param {Object} data + * @property {string} data.type - The message type. + * @property {string} data.message - The message text to identify the language of. + * @property {number} data.messageId - The ID of the message. 
+ */ + addEventListener("message", ({ data }) => { + try { + if (data.type === "initialize") { + throw new Error( + "The language-identification engine must not be re-initialized." + ); + } + switch (data.type) { + case "language-id-request": { + const { message, messageId } = data; + try { + const [confidence, langTag] = + languageIdEngine.identifyLanguage(message); + postMessage({ + type: "language-id-response", + langTag, + confidence, + messageId, + }); + } catch (error) { + console.error(error); + postMessage({ + type: "language-id-error", + messageId, + }); + } + break; + } + default: { + console.warn("Unknown message type:", data.type); + } + } + } catch (error) { + // Ensure the unexpected errors are surfaced in the console. + console.error(error); + } + }); +} + +/** + * The LanguageIdEngine wraps around a machine-learning model that can identify text + * as being written in a given human language. The engine is responsible for invoking + * model and returning the language tag in the format that is expected by firefox + * translations code. + */ +class LanguageIdEngine { + /** @type {FastTextModel} */ + #model; + + /** + * @param {FastTextModel} model + */ + constructor(model) { + this.#model = model; + } + + /** + * Formats the language tag returned by the language-identification model to match + * conform to the format used internally by Firefox. + * + * This function is currently configured to handle the fastText language-identification + * model. Updating the language-identification model or moving to something other than + * fastText in the future will likely require updating this function. + * + * @param {string} langTag + * @returns {string} The correctly formatted langTag + */ + #formatLangTag(langTag) { + // The fastText language model returns values of the format "__label__{langTag}". + // As such, this function strips the "__label__" prefix, leaving only the langTag. 
+ let formattedTag = langTag.replace("__label__", ""); + + // fastText is capable of returning any of a predetermined set of 176 langTags: + // https://fasttext.cc/docs/en/language-identification.html + // + // These tags come from ISO639-3: + // https://iso639-3.sil.org/code_tables/deprecated_codes/data + // + // Each of these tags have been cross checked for compatibility with the IANA + // language subtag registry, which is used by BCP 47, and any edge cases are handled below. + // https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry + switch (formattedTag) { + // fastText may return "eml" which is a deprecated ISO639-3 language tag for the language + // Emiliano-Romagnolo. It was split into two separate tags "egl" and "rgn": + // https://iso639-3.sil.org/request/2008-040 + // + // "eml" was once requested to be added to the IANA registry, but it was denied: + // https://www.alvestrand.no/pipermail/ietf-languages/2009-December/009754.html + // + // This case should return either "egl" or "rgn", given that the "eml" tag was split. + // However, given that the fastText model does not distinguish between the two by using + // the deprecated tag, this function will default to "egl" because it is alphabetically first. + // + // At such a time that Firefox Translations may support either of these languages, we should consider + // a way to further distinguish between the two languages at that time. + case "eml": { + formattedTag = "egl"; + break; + } + // The fastText model returns "no" for Norwegian Bokmål. + // + // According to advice from https://r12a.github.io/app-subtags/ + // "no" is a macro language that encompasses the following more specific primary language subtags: "nb" "nn". + // It is recommended to use more specific language subtags as long as it does not break legacy usage of an application. + // As such, this function will return "nb" for Norwegian Bokmål instead of "no" as reported by fastText. 
+ case "no": { + formattedTag = "nb"; + break; + } + } + return formattedTag; + } + + /** + * Identifies the human language in which the message is written and returns + * the BCP 47 language tag of the language it is determined to be along along + * with a rating of how confident the model is that the label is correct. + * + * @param {string} message + * @returns {Array<number | string>} An array containing the confidence and language tag. + * The confidence is a number between 0 and 1, representing a percentage. + * The language tag is a BCP 47 language tag such as "en" for English. + * + * e.g. [0.87, "en"] + */ + identifyLanguage(message) { + const mostLikelyLanguageData = this.#model + .predict(message.trim(), LANGUAGE_COUNT, CONFIDENCE_THRESHOLD) + .get(0); + + // This should never fail as long as + // LANGUAGE_COUNT > 1 && CONFIDENCE_THRESHOLD === 0.0 + if (!mostLikelyLanguageData) { + throw new Error("Unable to identify a language"); + } + + const [confidence, langTag] = mostLikelyLanguageData; + return [confidence, this.#formatLangTag(langTag)]; + } +} + +/** + * For testing purposes, provide a fully mocked engine. This allows for easy integration + * testing of the UI, without having to rely on downloading remote models and remote + * wasm binaries. + */ +class MockedLanguageIdEngine { + /** @type {string} */ + #langTag; + /** @type {number} */ + #confidence; + + /** + * @param {string} langTag + * @param {number} confidence + */ + constructor(langTag, confidence) { + this.#langTag = langTag; + this.#confidence = confidence; + } + + /** + * Mocks identifying a language by returning the mocked engine's pre-determined + * language tag and confidence values. 
+ */ + identifyLanguage(_message) { + return [this.#confidence, this.#langTag]; + } +} diff --git a/toolkit/components/translations/content/simd-detect-worker.js b/toolkit/components/translations/content/simd-detect-worker.js new file mode 100644 index 0000000000..35efce5e25 --- /dev/null +++ b/toolkit/components/translations/content/simd-detect-worker.js @@ -0,0 +1,42 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +let isSimdSupported = false; + +/** + * WebAssembly counts as unsafe eval in privileged contexts, so we have to execute this + * code in a ChromeWorker. The code feature detects SIMD support. The comment above + * the binary code is the .wat version of the .wasm binary. + */ + +try { + new WebAssembly.Module( + new Uint8Array( + // ``` + // ;; Detect SIMD support. + // ;; Compile by running: wat2wasm --enable-all simd-detect.wat + // + // (module + // (func (result v128) + // i32.const 0 + // i8x16.splat + // i8x16.popcnt + // ) + // ) + // ``` + + // prettier-ignore + [ + 0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00, 0x01, 0x05, 0x01, 0x60, 0x00, + 0x01, 0x7b, 0x03, 0x02, 0x01, 0x00, 0x0a, 0x0a, 0x01, 0x08, 0x00, 0x41, 0x00, + 0xfd, 0x0f, 0xfd, 0x62, 0x0b + ] + ) + ); + isSimdSupported = true; +} catch (error) { + console.error(`Translations: SIMD not supported`, error); +} + +postMessage({ isSimdSupported }); diff --git a/toolkit/components/translations/content/translations-document.sys.mjs b/toolkit/components/translations/content/translations-document.sys.mjs new file mode 100644 index 0000000000..c1c883dbc8 --- /dev/null +++ b/toolkit/components/translations/content/translations-document.sys.mjs @@ -0,0 +1,1284 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs"; + +const lazy = {}; + +ChromeUtils.defineESModuleGetters(lazy, { + setTimeout: "resource://gre/modules/Timer.sys.mjs", +}); + +XPCOMUtils.defineLazyGetter(lazy, "console", () => { + return console.createInstance({ + maxLogLevelPref: "browser.translations.logLevel", + prefix: "Translations", + }); +}); + +/** + * Map the NodeFilter enums that are used by the TreeWalker into enums that make + * sense for determining the status of the nodes for the TranslationsDocument process. + * This aligns the meanings of the filtering for the translations process. + */ +const NodeStatus = { + // This node is ready to translate as is. + READY_TO_TRANSLATE: NodeFilter.FILTER_ACCEPT, + + // This node contains too many block elements and needs to be subdivided further. + SUBDIVIDE_FURTHER: NodeFilter.FILTER_SKIP, + + // This node should not be considered for translation. + NOT_TRANSLATABLE: NodeFilter.FILTER_REJECT, +}; + +/** + * @typedef {import("../translations").NodeVisibility} NodeVisibility + * @typedef {(message: string) => Promise<string>} TranslationFunction + */ + +/** + * How often the DOM is updated with translations, in milliseconds. + */ +const DOM_UPDATE_INTERVAL_MS = 50; + +/** + * These tags are excluded from translation. + */ +const EXCLUDED_TAGS = new Set([ + // The following are elements that semantically should not be translated. + "CODE", + "KBD", + "SAMP", + "VAR", + "ACRONYM", + + // The following are deprecated tags. + "DIR", + "APPLET", + + // The following are embedded elements, and are not supported (yet). + "SVG", + "MATH", + "EMBED", + "OBJECT", + "IFRAME", + + // These are elements that are treated as opaque by Firefox which causes their + // innerHTML property to be just the raw text node behind it. 
Any text that is sent as + // HTML must be valid, and there is no guarantee that the innerHTML is valid. + "NOSCRIPT", + "NOEMBED", + "NOFRAMES", + + // The title is handled separately, and a HEAD tag should not be considered. + "HEAD", + + // These are not user-visible tags. + "STYLE", + "SCRIPT", + "TEMPLATE", + + // Textarea elements contain user content, which should not be translated. + "TEXTAREA", +]); + +// Tags that are treated as assumed inline. This list has been created by heuristics +// and excludes some commonly inline tags, due to how they are used practically. +// +// An actual list of inline elements is available here: +// https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#list_of_inline_elements +const INLINE_TAGS = new Set([ + "ABBR", + "B", + "CODE", + "DEL", + "EM", + "I", + "INS", + "KBD", + "MARK", + "MATH", + "OUTPUT", + "Q", + "RUBY", + "SMALL", + "STRONG", + "SUB", + "SUP", + "TIME", + "U", + "VAR", + "WBR", + + // These are not really inline, but bergamot-translator treats these as + // sentence-breaking. + "BR", + "TD", + "TH", + "LI", +]); + +/** + * Tags that can't reliably be assumed to be inline or block elements. They default + * to inline, but are often used as block elements. + */ +const GENERIC_TAGS = new Set(["A", "SPAN"]); + +/** + * This class manages the process of translating the DOM from one language to another. + * A translateHTML and a translateText function are injected into the constructor. This + * class is responsible for subdividing a Node into small enough pieces to where it + * contains a reasonable amount of text and inline elements for the translations engine + * to translate. Once a node has been identified as a small enough chunk, its innerHTML + * is read, and sent for translation. The async translation result comes back as an HTML + * string. The DOM node is updated with the new text and potentially changed DOM ordering. 
+ * + * This class also handles mutations of the DOM and will translate nodes as they are added + * to the page, or the when the node's text is changed by content scripts. + */ +export class TranslationsDocument { + /** + * The BCP 47 language tag that is used on the page. + * + * @type {string} */ + documentLanguage; + + /** + * The timeout between the first translation received and the call to update the DOM + * with translations. + */ + #updateTimeout = null; + + /** + * The nodes that need translations. They are queued when the document tree is walked, + * and then they are dispatched for translation based on their visibility. The viewport + * nodes are given the highest priority. + * + * @type {Map<Node, NodeVisibility>} + */ + #queuedNodes = new Map(); + + /** + * The count of how many pending translations have been sent to the translations + * engine. + */ + #pendingTranslationsCount = 0; + + /** + * The list of nodes that need updating with the translated HTML. These are batched + * into an update. + * + * @type {Set<{ node: Node, translatedHTML: string }} + */ + #nodesWithTranslatedHTML = new Set(); + + /** + * The set of nodes that have been subdivided and processed for translation. They + * should not be submitted again unless their contents have been changed. + * + * @type {WeakSet<Node>} + */ + #processedNodes = new WeakSet(); + + /** + * All root elements we're trying to translate. This should be the `document.body` + * and the the `title` element. + * + * @type {Set<Node>} + */ + #rootNodes = new Set(); + + /** + * This promise gets resolved when the initial viewport translations are done. + * This is a key user-visible performance metric. It represents what the user + * actually sees. + * + * @type {Promise<void> | null} + */ + viewportTranslated = null; + + /** + * Construct a new TranslationsDocument. It is tied to a specific Document and cannot + * be re-used. 
The translation functions are injected since this class shouldn't + * manage the life cycle of the translations engines. + * + * @param {Document} document + * @param {string} documentLanguage - The BCP 47 language tag. + * @param {number} innerWindowId - This is used for better profiler marker reporting. + * @param {TranslationFunction} translateHTML + * @param {TranslationFunction} translateText + */ + constructor( + document, + documentLanguage, + innerWindowId, + translateHTML, + translateText + ) { + /** + * The language of the document. If elements are found that do not match this language, + * then they are skipped. + * + * @type {string} + */ + this.documentLanguage = documentLanguage; + if (documentLanguage.length !== 2) { + throw new Error( + "Expected the language to be a valid 2 letter BCP 47 language tag: " + + documentLanguage + ); + } + + /** @type {TranslationFunction} */ + this.translateHTML = translateHTML; + + /** @type {TranslationFunction} */ + this.translateText = translateText; + + /** @type {number} */ + this.innerWindowId = innerWindowId; + + /** @type {DOMParser} */ + this.domParser = new document.ownerGlobal.DOMParser(); + + /** + * This selector runs to find child nodes that should be excluded. It should be + * basically the same implementation of `isExcludedNode`, but as a selector. + * + * @type {string} + */ + this.excludedNodeSelector = [ + // Use: [lang|=value] to match language codes. + // + // Per: https://developer.mozilla.org/en-US/docs/Web/CSS/Attribute_selectors + // + // The elements with an attribute name of attr whose value can be exactly + // value or can begin with value immediately followed by a hyphen, - (U+002D). + // It is often used for language subcode matches. 
+ `[lang]:not([lang|="${this.documentLanguage}"])`, + `[translate=no]`, + `.notranslate`, + `[contenteditable="true"]`, + `[contenteditable=""]`, + [...EXCLUDED_TAGS].join(","), + ].join(","); + + this.observer = new document.ownerGlobal.MutationObserver(mutationsList => { + for (const mutation of mutationsList) { + switch (mutation.type) { + case "childList": + for (const node of mutation.addedNodes) { + this.#processedNodes.delete(node); + this.subdivideNodeForTranslations(node); + } + break; + case "characterData": + this.#processedNodes.delete(mutation); + this.subdivideNodeForTranslations(mutation.target); + break; + default: + break; + } + } + }); + } + + /** + * Add a new element to start translating. This root is tracked for mutations and + * kept up to date with translations. This will be the body element and title tag + * for the document. + * + * @param {Element} [node] + */ + addRootElement(node) { + if (!node) { + return; + } + + if (node.nodeType !== Node.ELEMENT_NODE) { + // This node is not an element, do not add it. + return; + } + + if (this.#rootNodes.has(node)) { + // Exclude nodes that are already targetted. + return; + } + + this.#rootNodes.add(node); + + this.subdivideNodeForTranslations(node); + + this.observer.observe(node, { + characterData: true, + childList: true, + subtree: true, + }); + } + + /** + * Start walking down through a node's subtree and decide which nodes to queue for + * translation. This first node could be the root nodes of the DOM, such as the + * document body, or the title element, or it could be a mutation target. + * + * The nodes go through a process of subdivision until an appropriate sized chunk + * of inline text can be found. + * + * @param {Node} node + */ + subdivideNodeForTranslations(node) { + if (!this.#rootNodes.has(node)) { + // This is a non-root node, which means it came from a mutation observer. + // Ensure that it is a valid node to translate by checking all of its ancestors. 
+ for (let parent of getAncestorsIterator(node)) { + if ( + this.determineTranslationStatus(parent) === + NodeStatus.NOT_TRANSLATABLE + ) { + return; + } + } + } + + switch (this.determineTranslationStatusForUnprocessedNodes(node)) { + case NodeStatus.NOT_TRANSLATABLE: + // This node is rejected as it shouldn't be translated. + return; + + case NodeStatus.READY_TO_TRANSLATE: + // This node is ready for translating, and doesn't need to be subdivided. There + // is no reason to run the TreeWalker, it can be directly submitted for + // translation. + this.queueNodeForTranslation(node); + break; + + case NodeStatus.SUBDIVIDE_FURTHER: + // This node may be translatable, but it needs to be subdivided into smaller + // pieces. Create a TreeWalker to walk the subtree, and find the subtrees/nodes + // that contain enough inline elements to send to be translated. + { + const nodeIterator = node.ownerDocument.createTreeWalker( + node, + NodeFilter.SHOW_ELEMENT, + this.determineTranslationStatusForUnprocessedNodes + ); + + // This iterator will contain each node that has been subdivided enough to + // be translated. + let currentNode; + while ((currentNode = nodeIterator.nextNode())) { + this.queueNodeForTranslation(currentNode); + } + } + break; + } + + if (node.nodeName === "BODY") { + this.reportWordsInViewport(); + } + this.dispatchQueuedTranslations(); + } + + /** + * Test whether this is an element we do not want to translate. These are things like + * <code> elements, elements with a different "lang" attribute, and elements that + * have a `translate=no` attribute. + * + * @param {Node} node + */ + isExcludedNode(node) { + // Property access be expensive, so destructure required properties so they are + // not accessed multiple times. + const { nodeType } = node; + + if (nodeType === Node.TEXT_NODE) { + // Text nodes are never excluded. + return false; + } + if (nodeType !== Node.ELEMENT_NODE) { + // Only elements and and text nodes should be considered. 
+ return true; + } + + const { nodeName } = node; + + if (EXCLUDED_TAGS.has(nodeName)) { + // This is an excluded tag. + return true; + } + + if (!this.matchesDocumentLanguage(node)) { + // Exclude nodes that don't match the fromLanguage. + return true; + } + + if (node.getAttribute("translate") === "no") { + // This element has a translate="no" attribute. + // https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/translate + return true; + } + + if (node.classList.contains("notranslate")) { + // Google Translate skips translations if the classList contains "notranslate" + // https://cloud.google.com/translate/troubleshooting + return true; + } + + if (node.isContentEditable) { + // This field is editable, and so exclude it similar to the way that form input + // fields are excluded. + return true; + } + + return false; + } + + /** + * Runs `determineTranslationStatus`, but only on unprocessed nodes. + * + * @param {Node} node + * @return {number} - One of the NodeStatus values. + */ + determineTranslationStatusForUnprocessedNodes = node => { + if (this.#processedNodes.has(node)) { + // Skip nodes that have already been processed. + return NodeStatus.NOT_TRANSLATABLE; + } + + return this.determineTranslationStatus(node); + }; + + /** + * Determines if a node should be submitted for translation, not translatable, or if + * it should be subdivided further. It doesn't check if the node has already been + * processed. + * + * The return result works as a TreeWalker NodeFilter as well. + * + * @param {Node} node + * @returns {number} - One of the `NodeStatus` values. See that object + * for documentation. These values match the filters for the TreeWalker. + * These values also work as a `NodeFilter` value. + */ + determineTranslationStatus(node) { + if (isNodeQueued(node, this.#queuedNodes)) { + // This node or its parent was already queued, reject it. 
+ return NodeStatus.NOT_TRANSLATABLE; + } + + if (this.isExcludedNode(node)) { + // This is an explicitly excluded node. + return NodeStatus.NOT_TRANSLATABLE; + } + + if (node.textContent.trim().length === 0) { + // Do not use subtrees that are empty of text. This textContent call is fairly + // expensive. + return NodeStatus.NOT_TRANSLATABLE; + } + + if (nodeNeedsSubdividing(node)) { + // Skip this node, and dig deeper into its tree to cut off smaller pieces + // to translate. It is presumed to be a wrapper of block elements. + return NodeStatus.SUBDIVIDE_FURTHER; + } + + if ( + containsExcludedNode(node, this.excludedNodeSelector) && + !hasTextNodes(node) + ) { + // Skip this node, and dig deeper into its tree to cut off smaller pieces + // to translate. + return NodeStatus.SUBDIVIDE_FURTHER; + } + + // This node can be treated as entire block to submit for translation. + return NodeStatus.READY_TO_TRANSLATE; + } + + /** + * Queue a node for translation. + * @param {Node} node + */ + queueNodeForTranslation(node) { + /** @type {NodeVisibility} */ + let visibility = "out-of-viewport"; + if (isNodeHidden(node)) { + visibility = "hidden"; + } else if (isNodeInViewport(node)) { + visibility = "in-viewport"; + } + + this.#queuedNodes.set(node, visibility); + } + + /** + * Submit the translations giving priority to nodes in the viewport. 
+ */ + async dispatchQueuedTranslations() { + let inViewportCounts = 0; + let outOfViewportCounts = 0; + let hiddenCounts = 0; + + let inViewportTranslations; + if (!this.viewportTranslated) { + inViewportTranslations = []; + } + + for (const [node, visibility] of this.#queuedNodes) { + if (visibility === "in-viewport") { + inViewportCounts++; + const promise = this.submitTranslation(node); + if (inViewportTranslations) { + inViewportTranslations.push(promise); + } + } + } + for (const [node, visibility] of this.#queuedNodes) { + if (visibility === "out-of-viewport") { + outOfViewportCounts++; + this.submitTranslation(node); + } + } + for (const [node, visibility] of this.#queuedNodes) { + if (visibility === "hidden") { + hiddenCounts++; + this.submitTranslation(node); + } + } + + ChromeUtils.addProfilerMarker( + "Translations", + { innerWindowId: this.innerWindowId }, + `Translate ${this.#queuedNodes.size} nodes.\n\n` + + `In viewport: ${inViewportCounts}\n` + + `Out of viewport: ${outOfViewportCounts}\n` + + `Hidden: ${hiddenCounts}\n` + ); + + this.#queuedNodes.clear(); + + if (!this.viewportTranslated && inViewportTranslations) { + // Provide a promise that can be used to determine when the initial viewport has + // been translated. This is a key user-visible metric. + this.viewportTranslated = Promise.allSettled(inViewportTranslations); + } + } + + /** + * Record how many words were in the viewport, as this is the most important + * user-visible translation content. + */ + reportWordsInViewport() { + if ( + // This promise gets created for the first dispatchQueuedTranslations + this.viewportTranslated || + this.#queuedNodes.size === 0 + ) { + return; + } + + // TODO(Bug 1814195) - Add telemetry. + // TODO(Bug 1820618) - This whitespace regex will not work in CJK-like languages. + // This requires a segmenter for a proper implementation. 
+ + const whitespace = /\s+/; + let wordCount = 0; + for (const [node, visibility] of this.#queuedNodes) { + if (visibility === "in-viewport") { + wordCount += node.textContent.trim().split(whitespace).length; + } + } + + const message = wordCount + " words are in the viewport."; + lazy.console.log(message); + ChromeUtils.addProfilerMarker( + "Translations", + { innerWindowId: this.innerWindowId }, + message + ); + } + + /** + * Submit a node for translation to the translations engine. + * + * @param {Node} node + * @returns {Promise<void>} + */ + async submitTranslation(node) { + // Give each element an id that gets passed through the translation so it can be + // reunited later on. + if (node.nodeType === Node.ELEMENT_NODE) { + node.querySelectorAll("*").forEach((el, i) => { + el.dataset.mozTranslationsId = i; + }); + } + + let text, translate; + if (node.nodeType === Node.ELEMENT_NODE) { + text = node.innerHTML; + translate = this.translateHTML; + } else { + text = node.textContent; + translate = this.translateText; + } + + if (text.trim().length === 0) { + return; + } + + // Mark this node as not to be translated again unless the contents are changed + // (which the observer will pick up on) + this.#processedNodes.add(node); + + this.#pendingTranslationsCount++; + try { + const [translatedHTML] = await translate(text); + this.#pendingTranslationsCount--; + this.scheduleNodeUpdateWithTranslation(node, translatedHTML); + } catch (error) { + this.#pendingTranslationsCount--; + lazy.console.error("Translation failed", error); + } + } + + /** + * Start the mutation observer, for instance after applying the translations to the DOM. + */ + startMutationObserver() { + if (Cu.isDeadWrapper(this.observer)) { + // This observer is no longer alive. + return; + } + for (const node of this.#rootNodes) { + if (Cu.isDeadWrapper(node)) { + // This node is no longer alive. 
+ continue; + } + this.observer.observe(node, { + characterData: true, + childList: true, + subtree: true, + }); + } + } + + /** + * Stop the mutation observer, for instance to apply the translations to the DOM. + */ + stopMutationObserver() { + // Was the window already destroyed? + if (!Cu.isDeadWrapper(this.observer)) { + this.observer.disconnect(); + } + } + + /** + * This is called every `DOM_UPDATE_INTERVAL_MS` ms with translations for nodes. + * + * This function is called asynchronously, so nodes may already be dead. Before + * accessing a node make sure and run `Cu.isDeadWrapper` to check that it is alive. + */ + updateNodesWithTranslations() { + // Stop the mutations so that the updates won't trigger observations. + this.stopMutationObserver(); + + for (const { node, translatedHTML } of this.#nodesWithTranslatedHTML) { + if (Cu.isDeadWrapper(node)) { + // The node is no longer alive. + ChromeUtils.addProfilerMarker( + "Translations", + { innerWindowId: this.innerWindowId }, + "Node is no long alive." + ); + continue; + } + switch (node.nodeType) { + case Node.TEXT_NODE: { + if (translatedHTML.trim().length !== 0) { + // Only update the node if there is new text. + node.textContent = translatedHTML; + } + break; + } + case Node.ELEMENT_NODE: { + // TODO (Bug 1820625) - This is slow compared to the original implementation + // in the addon which set the innerHTML directly. We can't set the innerHTML + // here, but perhaps there is another way to get back some of the performance. + const translationsDocument = this.domParser.parseFromString( + `<!DOCTYPE html><div>${translatedHTML}</div>`, + "text/html" + ); + updateElement(translationsDocument, node); + break; + } + } + } + + this.#nodesWithTranslatedHTML.clear(); + this.#updateTimeout = null; + + // Done mutating the DOM. + this.startMutationObserver(); + } + + /** + * Schedule a node to be updated with a translation. 
+ * + * @param {Node} node + * @param {string} translatedHTML + */ + scheduleNodeUpdateWithTranslation(node, translatedHTML) { + // Add the nodes to be populated with the next translation update. + this.#nodesWithTranslatedHTML.add({ node, translatedHTML }); + + if (this.#pendingTranslationsCount === 0) { + // No translations are pending, update the node. + this.updateNodesWithTranslations(); + } else if (!this.#updateTimeout) { + // Schedule an update. + this.#updateTimeout = lazy.setTimeout( + this.updateNodesWithTranslations.bind(this), + DOM_UPDATE_INTERVAL_MS + ); + } else { + // An update has been previously scheduled, do nothing here. + } + } + + /** + * Check to see if a language matches the document language. + * + * @param {Node} node + */ + matchesDocumentLanguage(node) { + if (!node.lang) { + // No `lang` was present, so assume it matches the language. + return true; + } + + // First, cheaply check if language tags match, without canonicalizing. + if (langTagsMatch(this.documentLanguage, node.lang)) { + return true; + } + + try { + // Make sure the local is in the canonical form, and check again. This function + // throws, so don't trust that the language tags are formatting correctly. + const [language] = Intl.getCanonicalLocales(node.lang); + + return langTagsMatch(this.documentLanguage, language); + } catch (_error) { + return false; + } + } +} + +/** + * This function needs to be fairly fast since it's used on many nodes when iterating + * over the DOM to find nodes to translate. + * + * @param {Text | HTMLElement} node + */ +function isNodeHidden(node) { + /** @type {HTMLElement} */ + const element = node.nodeType === Node.TEXT_NODE ? node.parentElement : node; + + // This flushes the style, which is a performance cost. + const style = element.ownerGlobal.getComputedStyle(element); + return style.display === "none" || style.visibility === "hidden"; +} + +/** + * This function cheaply checks that language tags match. 
/**
 * Cheaply compare two language tags without canonicalizing either of them.
 *
 * The tags match when they are identical, or when `otherLanguage` is the two-letter
 * `knownLanguage` followed by a "-" and further subtags, e.g. "en" and "en-US".
 *
 * @param {string} knownLanguage - A two-letter language tag.
 * @param {string} otherLanguage - The tag to compare against.
 * @returns {boolean}
 * @throws {Error} When the tags are not identical and `knownLanguage` is not exactly
 *   two characters long.
 */
function langTagsMatch(knownLanguage, otherLanguage) {
  // A simple direct match.
  const isIdentical = knownLanguage === otherLanguage;
  if (isIdentical) {
    return true;
  }

  const isTwoLetterTag = knownLanguage.length === 2;
  if (!isTwoLetterTag) {
    throw new Error("Expected the knownLanguage to be of length 2.");
  }

  // Check that the language part matches, e.g. "en" and "en-US". All three
  // comparisons are side-effect free, so the order in which they run is irrelevant.
  if (otherLanguage[2] !== "-") {
    return false;
  }
  if (knownLanguage[1] !== otherLanguage[1]) {
    return false;
  }
  return knownLanguage[0] === otherLanguage[0];
}

/**
 * Report whether a node lies entirely inside the visible viewport. Text nodes are
 * measured through their parent element.
 *
 * This function runs when walking the DOM, which means it is a hot function. It runs
 * fairly fast even though it is computing the bounding box. This is all done in a tight
 * loop, and it is done on mutations. Care should be taken with reflows caused by
 * getBoundingClientRect, as this is a common performance issue.
 *
 * The following are the counts of how often this is run on a news site:
 *
 * Given:
 *  1573 DOM nodes
 *  504 Text nodes
 *  1069 Elements
 *
 * There were:
 *  209 calls to this function.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function isNodeInViewport(node) {
  const { ownerGlobal, ownerDocument } = node;

  /** @type {HTMLElement} */
  let measuredElement = node;
  if (node.nodeType === Node.TEXT_NODE) {
    measuredElement = node.parentElement;
  }

  const { top, left, bottom, right } = measuredElement.getBoundingClientRect();

  // The top-left corner must not be scrolled out of view.
  if (!(top >= 0 && left >= 0)) {
    return false;
  }

  // Fall back to the root element's client size when the window reports 0.
  const viewportHeight =
    ownerGlobal.innerHeight || ownerDocument.documentElement.clientHeight;
  if (!(bottom <= viewportHeight)) {
    return false;
  }

  const viewportWidth =
    ownerGlobal.innerWidth || ownerDocument.documentElement.clientWidth;
  return right <= viewportWidth;
}

/**
 * Actually perform the update of the element with the translated node. This step
 * will detach all of the "live" nodes, and match them up in the correct order as provided
 * by the translations engine.
 */
/**
 * Actually perform the update of the element with the translated node. This step
 * will detach all of the "live" nodes, and match them up in the correct order as provided
 * by the translations engine.
 *
 * @param {Document} translationsDocument - The parsed translation. The first child of
 *   its body is used as the root of the translated tree.
 * @param {Element} element - The live element that receives the translation.
 * @returns {void}
 */
function updateElement(translationsDocument, element) {
  // This text should have the same layout as the target, but it's not completely
  // guaranteed since the content page could change at any time, and the translation
  // process is async.
  //
  // The document has the following structure:
  //
  // <html>
  //   <head>
  //   <body>{translated content}</body>
  // </html>

  // Only used for diagnostics when the trees could not be fully unified.
  const originalHTML = element.innerHTML;

  /**
   * The Set of translation IDs for nodes that have been cloned. The IDs are read
   * from `dataset.mozTranslationsId`.
   * @type {Set<string>}
   */
  const clonedNodes = new Set();

  merge(element, translationsDocument.body.firstChild);

  /**
   * Merge the live tree with the translated tree by re-using elements from the live tree.
   *
   * @param {Node} liveTree
   * @param {Node} translatedTree
   */
  function merge(liveTree, translatedTree) {
    /** @type {Map<string, Element>} */
    const liveElementsById = new Map();

    /** @type {Array<Text>} */
    const liveTextNodes = [];

    // Remove all the nodes from the liveTree, and categorize them by Text node or
    // Element node.
    let node;
    while ((node = liveTree.firstChild)) {
      node.remove();

      if (node.nodeType === Node.ELEMENT_NODE) {
        liveElementsById.set(node.dataset.mozTranslationsId, node);
      } else if (node.nodeType === Node.TEXT_NODE) {
        liveTextNodes.push(node);
      }
    }

    // The translated tree dictates the order. Iterate over a snapshot rather than the
    // live `childNodes` list: a translated Text node may be moved into the live tree
    // below, which removes it from `translatedTree` and would otherwise cause the node
    // that follows it to be skipped.
    const translatedNodes = Array.from(translatedTree.childNodes);

    for (const translatedNode of translatedNodes) {
      if (translatedNode.nodeType === Node.TEXT_NODE) {
        // Copy the translated text to the original Text node and re-append it.
        let liveTextNode = liveTextNodes.shift();

        if (liveTextNode) {
          liveTextNode.data = translatedNode.data;
        } else {
          // There is no live Text node left to re-use, so move the translated one over.
          liveTextNode = translatedNode;
        }

        liveTree.appendChild(liveTextNode);
        continue;
      }

      if (translatedNode.nodeType !== Node.ELEMENT_NODE) {
        // Only Text and Element nodes are merged.
        continue;
      }

      // Element nodes try to use the already existing DOM nodes.
      const translationsId = translatedNode.dataset.mozTranslationsId;

      // Find the element in the live tree that matches the one in the translated tree.
      let liveElement = liveElementsById.get(translationsId);

      if (!liveElement) {
        lazy.console.warn("Could not find a corresponding live element", {
          path: createNodePath(translatedNode, translationsDocument.body),
          translationsId,
          liveElementsById,
          translatedNode,
        });
        continue;
      }

      // Has this element already been added to the list? Then duplicate it and re-add
      // it as a clone. The Translations Engine can sometimes duplicate HTML.
      if (liveElement.parentNode) {
        liveElement = liveElement.cloneNode(true /* deep clone */);
        clonedNodes.add(translationsId);
        lazy.console.warn(
          "Cloning a node because it was already inserted earlier",
          {
            path: createNodePath(translatedNode, translationsDocument.body),
            translatedNode,
            liveElement,
          }
        );
      }

      if (isNodeTextEmpty(translatedNode)) {
        // The original node had text, but the one that came out of translation
        // didn't have any text. This scenario might be caused by one of two causes:
        //
        //   1) The element was duplicated by translation but then not given text
        //      content. This happens on Wikipedia articles for example.
        //
        //   2) The translator messed up and could not translate the text. This
        //      happens on YouTube in the language selector. In that case, having the
        //      original text is much better than no text at all.
        //
        // To make sure it is case 1 and not case 2 check whether this is the only
        // occurrence.
        const isDuplicated = translatedNodes.some(
          sibling =>
            // Skip the current node, it is not a sibling.
            sibling !== translatedNode &&
            // Only consider other element nodes.
            sibling.nodeType === Node.ELEMENT_NODE &&
            // A sibling with the same translationsId carries the text instead.
            sibling.dataset.mozTranslationsId === translationsId
        );

        if (isDuplicated) {
          // This is case 1 from above. Remove this element's original text nodes,
          // since a sibling now has all of the text nodes.
          removeTextNodes(liveElement);
        }

        // Report this issue to the console.
        lazy.console.warn(
          "The translated element has no text even though the original did.",
          {
            path: createNodePath(translatedNode, translationsDocument.body),
            translatedNode,
            liveElement,
          }
        );
      } else if (!isNodeTextEmpty(liveElement)) {
        // There are still text nodes to find and update, recursively merge.
        merge(liveElement, translatedNode);
      }

      // Put the live node back in the live branch. But now it has been synced with the
      // translated text and order.
      liveTree.appendChild(liveElement);
    }

    // Any live element that was detached above but never re-attached was lost.
    const unhandledElements = [...liveElementsById].filter(
      ([, liveElement]) => !liveElement.parentNode
    );

    if (unhandledElements.length) {
      lazy.console.warn(
        `${createNodePath(
          translatedTree,
          translationsDocument.body
        )} Not all nodes unified`,
        {
          unhandledElements,
          clonedNodes,
          originalHTML,
          translatedHTML: translationsDocument.body.innerHTML,
          liveTree: liveTree.outerHTML,
          translatedTree: translatedTree.outerHTML,
        }
      );
    }
  }
}

/**
 * For debug purposes, compute a string path to an element.
 *
 * e.g. "div/div#header/p.bold.string/a"
 *
 * @param {Node} node
 * @param {Node | null} root - The ancestor at which the path stops (it is not part of
 *   the path). Falls back to the body of the node's document when omitted or null.
 * @returns {string}
 */
function createNodePath(node, root) {
  const stopAt = root ?? node.ownerDocument.body;

  // The root must be forwarded when recursing, otherwise the walk would not stop at
  // the root and the path would run all the way up to the document.
  const { parentNode } = node;
  let path =
    parentNode && parentNode !== stopAt
      ? createNodePath(parentNode, stopAt)
      : "";

  path += `/${node.nodeName}`;
  if (node.id) {
    path += `#${node.id}`;
  } else if (node.className) {
    for (const className of node.classList) {
      path += `.${className}`;
    }
  }
  return path;
}

/**
 * Check whether a node has no text besides whitespace. Nodes exposing `innerText` are
 * judged by it, Text nodes by their `nodeValue`. Every other node counts as empty.
 *
 * @param {Node} node
 * @returns {boolean}
 */
function isNodeTextEmpty(node) {
  if ("innerText" in node) {
    return node.innerText.trim().length === 0;
  }
  if (node.nodeType === Node.TEXT_NODE && node.nodeValue) {
    return node.nodeValue.trim().length === 0;
  }
  return true;
}

/**
 * Remove every Text node from the subtree below the node.
 *
 * @param {Node} node
 */
function removeTextNodes(node) {
  // `childNodes` is a live list. Iterate over a snapshot so that removing a child does
  // not shift the remaining children underneath the iteration, which would skip them.
  for (const child of Array.from(node.childNodes)) {
    switch (child.nodeType) {
      case Node.TEXT_NODE:
        node.removeChild(child);
        break;
      case Node.ELEMENT_NODE:
        removeTextNodes(child);
        break;
      default:
        break;
    }
  }
}

/**
 * Test whether any of the direct child text nodes of the node are non-whitespace
 * text nodes.
 *
 * For example:
 *   - `<p>test</p>`: yes
 *   - `<p> </p>`: no
 *   - `<p><b>test</b></p>`: no
 *
 * @param {Node} node
 * @returns {boolean}
 */
function hasTextNodes(node) {
  if (node.nodeType !== Node.ELEMENT_NODE) {
    // Only check element nodes.
    return false;
  }

  for (const child of node.childNodes) {
    if (child.nodeType !== Node.TEXT_NODE) {
      continue;
    }
    if (child.textContent.trim() !== "") {
      // A text node with content was found.
      return true;
    }
    // Otherwise this is just whitespace, keep looking.
  }

  // No text nodes were found.
  return false;
}

/**
 * Like `isExcludedNode` but looks at the full subtree. Used to see whether
 * we can submit a subtree, or whether we should split it into smaller
 * branches first to try to exclude more of the non-translatable content.
 */
/**
 * Like `isExcludedNode` but looks at the full subtree. Used to see whether
 * we can submit a subtree, or whether we should split it into smaller
 * branches first to try to exclude more of the non-translatable content.
 *
 * @param {Node} node
 * @param {string} excludedNodeSelector
 * @returns {boolean} True only for element nodes with a descendant matching the selector.
 */
function containsExcludedNode(node, excludedNodeSelector) {
  if (node.nodeType !== Node.ELEMENT_NODE) {
    return false;
  }
  // `querySelector` yields the matching element or null, coerce it to the documented
  // boolean instead of leaking the element to the caller.
  return Boolean(node.querySelector(excludedNodeSelector));
}

/**
 * Check if this node has already been queued to be translated. This can be because
 * the node itself is queued, or one of its ancestors is queued.
 *
 * @param {Node} node
 * @param {Map<Node, any>} queuedNodes
 * @returns {boolean}
 */
function isNodeQueued(node, queuedNodes) {
  if (queuedNodes.has(node)) {
    return true;
  }

  // If the immediate parent is the body, it is allowed.
  if (node.parentNode === node.ownerDocument.body) {
    return false;
  }

  // Accessing the parentNode is expensive here according to performance profiling. This
  // is due to XrayWrappers. Minimize reading attributes by storing a reference to the
  // `parentNode` in a named variable, rather than re-accessing it.
  let parentNode;
  let lastNode = node;
  while ((parentNode = lastNode.parentNode)) {
    if (queuedNodes.has(parentNode)) {
      // Report the documented boolean rather than the queued ancestor itself.
      return true;
    }
    lastNode = parentNode;
  }

  return false;
}

/**
 * Test whether this node should be treated as a wrapper of text, e.g.
 * a `<p>`, or as a wrapper for block elements, e.g. `<div>`, based on
 * its ratio of assumed inline elements, and assumed "block" elements. If it is a wrapper
 * of block elements, then it needs more subdividing. This algorithm is based on
 * heuristics and is a best effort attempt at sorting contents without actually computing
 * the style of every element.
 *
 * If it's a Text node, it's inline and doesn't need subdividing.
 *
 *  "Lorem ipsum"
 *
 * If it is mostly filled with assumed "inline" elements, treat it as inline.
 *   <p>
 *     Lorem ipsum dolor sit amet, consectetur adipiscing elit.
 *     <b>Nullam ut finibus nibh</b>, at tincidunt tellus.
 *   </p>
 *
 *   Since it has 3 "inline" elements.
 *     1.
 */
"Lorem ipsum dolor sit amet, consectetur adipiscing elit." + * 2. <b>Nullam ut finibus nibh</b> + * 3. ", at tincidunt tellus." + * + * If it's mostly filled with block elements, do not treat it as inline, as it will + * need more subdividing. + * + * <section> + * Lorem ipsum <strong>dolor sit amet.</strong> + * <div>Nullam ut finibus nibh, at tincidunt tellus.</div> + * <div>Morbi pharetra mauris sed nisl mollis molestie.</div> + * <div>Donec et nibh sit amet velit tincidunt auctor.</div> + * </section> + * + * This node has 2 presumed "inline" elements: + * 1 "Lorem ipsum" + * 2. <strong>dolor sit amet.</strong>. + * + * And the 3 div "block" elements. Since 3 "block" elements > 2 "inline" elements, + * it is presumed to be "inline". + * + * @param {Node} node + * @returns {boolean} + */ +function nodeNeedsSubdividing(node) { + if (node.nodeType === Node.TEXT_NODE) { + // Text nodes are fully subdivided. + return false; + } + + let inlineElements = 0; + let blockElements = 0; + + if (node.nodeName === "TR") { + // TR elements always need subdividing, since the cells are the individual "inline" + // units. For instance the following would be invalid markup: + // + // <tr> + // This is <b>invalid</b> + // </tr> + // + // You will always have the following, which will need more subdividing. + // + // <tr> + // <td>This is <b>valid</b>.</td> + // <td>This is still valid.</td> + // </tr> + return true; + } + + for (let child of node.childNodes) { + switch (child.nodeType) { + case Node.TEXT_NODE: + if (!isNodeTextEmpty(child)) { + inlineElements += 1; + } + break; + case Node.ELEMENT_NODE: { + // Property access can be expensive, so destructure the required properties. 
+ const { nodeName } = child; + if (INLINE_TAGS.has(nodeName)) { + inlineElements += 1; + } else if (GENERIC_TAGS.has(nodeName) && !nodeNeedsSubdividing(child)) { + inlineElements += 1; + } else { + blockElements += 1; + } + break; + } + default: + break; + } + } + + return inlineElements < blockElements; +} + +/** + * Returns an iterator of a node's ancestors. + * + * @param {Node} node + * @returns {Generator<ParentNode>} + */ +function* getAncestorsIterator(node) { + const document = node.ownerDocument; + for ( + let parent = node.parentNode; + parent && parent !== document.documentElement; + parent = parent.parentNode + ) { + yield parent; + } +} diff --git a/toolkit/components/translations/content/translations-engine-worker.js b/toolkit/components/translations/content/translations-engine-worker.js new file mode 100644 index 0000000000..3cf12d1e92 --- /dev/null +++ b/toolkit/components/translations/content/translations-engine-worker.js @@ -0,0 +1,780 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* eslint-env mozilla/chrome-worker */ +"use strict"; + +/** + * @typedef {import("../translations").Bergamot} Bergamot + * @typedef {import("../translations").LanguageTranslationModelFiles} LanguageTranslationModelFiles + */ + +/* global loadBergamot */ +importScripts("chrome://global/content/translations/bergamot-translator.js"); + +// Respect the preference "browser.translations.logLevel". +let _loggingLevel = "Error"; +function log(...args) { + if (_loggingLevel !== "Error" && _loggingLevel !== "Warn") { + console.log("Translations:", ...args); + } +} +function trace(...args) { + if (_loggingLevel === "Trace" || _loggingLevel === "All") { + console.log("Translations:", ...args); + } +} + +// Throw Promise rejection errors so that they are visible in the console. 
self.addEventListener("unhandledrejection", event => {
  // Rethrow the rejection reason so that it is surfaced as an error.
  throw event.reason;
});

/**
 * The alignment for each file type, file type strings should be same as in the
 * model registry.
 */
const MODEL_FILE_ALIGNMENTS = {
  model: 256,
  lex: 64,
  vocab: 64,
  qualityModel: 64,
  srcvocab: 64,
  trgvocab: 64,
};

/**
 * Initialize the engine, and get it ready to handle translation requests.
 * The "initialize" message must be received before any other message handling
 * requests will be processed.
 */
addEventListener("message", handleInitializationMessage);

/**
 * Handle the "initialize" message: build the engine, install the regular message
 * handler, and report the outcome with an "initialization-success" or
 * "initialization-error" message. Any other message type is rejected with a console
 * error and leaves this handler installed.
 *
 * @param {MessageEvent} event
 */
async function handleInitializationMessage({ data }) {
  const startTime = performance.now();
  if (data.type !== "initialize") {
    console.error(
      "The TranslationEngine worker received a message before it was initialized."
    );
    return;
  }

  try {
    const { fromLanguage, toLanguage, enginePayload, logLevel, innerWindowId } =
      data;

    if (!fromLanguage) {
      throw new Error('Worker initialization missing "fromLanguage"');
    }
    if (!toLanguage) {
      throw new Error('Worker initialization missing "toLanguage"');
    }

    if (logLevel) {
      // Respect the "browser.translations.logLevel" preference.
      _loggingLevel = logLevel;
    }

    let engine;
    if (enginePayload.isMocked) {
      // The engine is testing mode, and no Bergamot wasm is available.
      engine = new MockedEngine(fromLanguage, toLanguage);
    } else {
      const { bergamotWasmArrayBuffer, languageModelFiles } = enginePayload;
      const bergamot = await BergamotUtils.initializeWasm(
        bergamotWasmArrayBuffer
      );
      engine = new Engine(
        fromLanguage,
        toLanguage,
        bergamot,
        languageModelFiles
      );
    }

    ChromeUtils.addProfilerMarker(
      "TranslationsWorker",
      { startTime, innerWindowId },
      "Translations engine loaded."
    );

    handleMessages(engine);
    postMessage({ type: "initialization-success" });
  } catch (error) {
    console.error(error);
    postMessage({ type: "initialization-error", error: error?.message });
  }

  // Initialization is attempted only once, whether or not it succeeded.
  removeEventListener("message", handleInitializationMessage);
}

/**
 * Sets up the message handling for the worker.
 *
 * Handled message types:
 *   - "translation-request": answered with a "translation-response", or with a
 *     "translation-error" when the engine throws.
 *   - "discard-translation-queue": answered with "translations-discarded".
 *
 * @param {Engine | MockedEngine} engine
 */
function handleMessages(engine) {
  let discardPromise;
  addEventListener("message", async ({ data }) => {
    try {
      if (data.type === "initialize") {
        throw new Error("The Translations engine must not be re-initialized.");
      }
      if (data.type === "translation-request") {
        // Only show these messages when "All" logging is on, since there are so many
        // of them.
        trace("Received message", data);
      } else {
        log("Received message", data);
      }

      switch (data.type) {
        case "translation-request": {
          const { messageBatch, messageId, isHTML, innerWindowId } = data;
          if (discardPromise) {
            // Wait for messages to be discarded if there are any.
            await discardPromise;
          }
          try {
            // Add translations to the work queue, and when they return, post the message
            // back. The translation may never return if the translations are discarded
            // before they have time to be run. In this case this await is just never
            // resolved, and the postMessage is never run.
            const translations = await engine.translate(
              messageBatch,
              isHTML,
              innerWindowId
            );

            // This logging level can be very verbose and slow, so only do it under the
            // "Trace" level, which is the most verbose. Set the logging level to "Info" to avoid
            // these, and get all of the other logs.
            trace("Translation complete", {
              messageBatch,
              translations,
              isHTML,
              innerWindowId,
            });

            postMessage({
              type: "translation-response",
              translations,
              messageId,
            });
          } catch (error) {
            console.error(error);
            let message = "An error occurred in the engine worker.";
            if (typeof error?.message === "string") {
              message = error.message;
            }
            let stack = "(no stack)";
            if (typeof error?.stack === "string") {
              stack = error.stack;
            }
            postMessage({
              type: "translation-error",
              error: { message, stack },
              messageId,
              innerWindowId,
            });
          }
          break;
        }
        case "discard-translation-queue": {
          ChromeUtils.addProfilerMarker(
            "TranslationsWorker",
            { innerWindowId: data.innerWindowId },
            "Translations discard requested"
          );

          // The engine keeps one work queue per innerWindowId, so the id has to be
          // passed along. Without it no queue is found and nothing gets discarded.
          discardPromise = engine.discardTranslations(data.innerWindowId);
          await discardPromise;
          discardPromise = null;

          // Signal to the "message" listeners in the main thread to stop listening.
          postMessage({
            type: "translations-discarded",
          });
          break;
        }
        default:
          console.warn("Unknown message type:", data.type);
      }
    } catch (error) {
      // Ensure the unexpected errors are surfaced in the console.
      console.error(error);
    }
  });
}

/**
 * The Engine is created once for a language pair. The initialization process copies the
 * ArrayBuffers for the language buffers from JS-managed ArrayBuffers, to aligned
 * internal memory for the wasm heap.
 *
 * After this the ArrayBuffers are discarded and GC'd. This file should be managed
 * from the TranslationsEngine class on the main thread.
 *
 * This class starts listening for messages only after the Bergamot engine has been
 * fully initialized.
 */
+ */ +class Engine { + /** + * @param {string} fromLanguage + * @param {string} toLanguage + * @param {Bergamot} bergamot + * @param {Array<LanguageTranslationModelFiles>} languageTranslationModelFiles + */ + constructor( + fromLanguage, + toLanguage, + bergamot, + languageTranslationModelFiles + ) { + /** @type {string} */ + this.fromLanguage = fromLanguage; + /** @type {string} */ + this.toLanguage = toLanguage; + /** @type {Bergamot} */ + this.bergamot = bergamot; + /** @type {Bergamot["TranslationModel"][]} */ + this.languageTranslationModels = languageTranslationModelFiles.map( + languageTranslationModelFiles => + BergamotUtils.constructSingleTranslationModel( + bergamot, + languageTranslationModelFiles + ) + ); + + /** @type {Bergamot["BlockingService"]} */ + this.translationService = new bergamot.BlockingService({ + // Caching is disabled (see https://github.com/mozilla/firefox-translations/issues/288) + cacheSize: 0, + }); + } + + /** + * Run the translation models to perform a batch of message translations. The + * promise is rejected when the sync version of this function throws an error. + * This function creates an async interface over the synchronous translation + * mechanism. This allows other microtasks such as message handling to still work + * even though the translations are CPU-intensive. + * + * @param {string[]} messageBatch + * @param {boolean} isHTML + * @param {number} innerWindowId - This is required + * + * @param {boolean} withQualityEstimation + * @returns {Promise<string[]>} + */ + translate( + messageBatch, + isHTML, + innerWindowId, + withQualityEstimation = false + ) { + return this.#getWorkQueue(innerWindowId).runTask(() => + this.#syncTranslate( + messageBatch, + isHTML, + innerWindowId, + withQualityEstimation + ) + ); + } + + /** + * Map each innerWindowId to its own WorkQueue. This makes it easy to shut down + * an entire queue of work when the page is unloaded. 
+ * + * @type {Map<number, WorkQueue>} + */ + #workQueues = new Map(); + + /** + * Get or create a `WorkQueue` that is unique to an `innerWindowId`. + * + * @param {number} innerWindowId + * @returns {WorkQueue} + */ + #getWorkQueue(innerWindowId) { + let workQueue = this.#workQueues.get(innerWindowId); + if (workQueue) { + return workQueue; + } + workQueue = new WorkQueue(innerWindowId); + this.#workQueues.set(innerWindowId, workQueue); + return workQueue; + } + + /** + * Cancels any in-progress translations by removing the work queue. + * + * @param {number} innerWindowId + */ + discardTranslations(innerWindowId) { + let workQueue = this.#workQueues.get(innerWindowId); + if (workQueue) { + workQueue.cancelWork(); + this.#workQueues.delete(innerWindowId); + } + } + + /** + * Run the translation models to perform a batch of message translations. This + * blocks the worker thread until it is completed. + * + * @param {string[]} messageBatch + * @param {boolean} isHTML + * @param {number} innerWindowId + * @param {boolean} withQualityEstimation + * @returns {string[]} + */ + #syncTranslate( + messageBatch, + isHTML, + innerWindowId, + withQualityEstimation = false + ) { + const startTime = performance.now(); + let response; + const { messages, options } = BergamotUtils.getTranslationArgs( + this.bergamot, + messageBatch, + isHTML, + withQualityEstimation + ); + try { + if (messages.size() === 0) { + return []; + } + + /** @type {Bergamot["VectorResponse"]} */ + let responses; + + if (this.languageTranslationModels.length === 1) { + responses = this.translationService.translate( + this.languageTranslationModels[0], + messages, + options + ); + } else if (this.languageTranslationModels.length === 2) { + responses = this.translationService.translateViaPivoting( + this.languageTranslationModels[0], + this.languageTranslationModels[1], + messages, + options + ); + } else { + throw new Error( + "Too many models were provided to the translation worker." 
+ ); + } + + // Extract JavaScript values out of the vector. + const translations = BergamotUtils.mapVector(responses, response => + response.getTranslatedText() + ); + + // Report on the time it took to do these translations. + let length = 0; + for (const message of messageBatch) { + length += message.length; + } + ChromeUtils.addProfilerMarker( + "TranslationsWorker", + { startTime, innerWindowId }, + `Translated ${length} code units.` + ); + + return translations; + } finally { + // Free up any memory that was allocated. This will always run. + messages?.delete(); + options?.delete(); + response?.delete(); + } + } +} + +/** + * Static utilities to help work with the Bergamot wasm module. + */ +class BergamotUtils { + /** + * Construct a single translation model. + * + * @param {Bergamot} bergamot + * @param {LanguageTranslationModelFiles} languageTranslationModelFiles + * @returns {Bergamot["TranslationModel"]} + */ + static constructSingleTranslationModel( + bergamot, + languageTranslationModelFiles + ) { + log(`Constructing translation model.`); + + const { model, lex, vocab, qualityModel, srcvocab, trgvocab } = + BergamotUtils.allocateModelMemory( + bergamot, + languageTranslationModelFiles + ); + + // Transform the bytes to mb, like "10.2mb" + const getMemory = memory => `${Math.floor(memory.size() / 100_000) / 10}mb`; + + let memoryLog = `Model memory sizes in wasm heap:`; + memoryLog += `\n Model: ${getMemory(model)}`; + memoryLog += `\n Shortlist: ${getMemory(lex)}`; + + // Set up the vocab list, which could either be a single "vocab" model, or a + // "srcvocab" and "trgvocab" pair. 
+ const vocabList = new bergamot.AlignedMemoryList(); + + if (vocab) { + vocabList.push_back(vocab); + memoryLog += `\n Vocab: ${getMemory(vocab)}`; + } else if (srcvocab && trgvocab) { + vocabList.push_back(srcvocab); + vocabList.push_back(trgvocab); + memoryLog += `\n Src Vocab: ${getMemory(srcvocab)}`; + memoryLog += `\n Trg Vocab: ${getMemory(trgvocab)}`; + } else { + throw new Error("Vocabulary key is not found."); + } + + if (qualityModel) { + memoryLog += `\n QualityModel: ${getMemory(qualityModel)}\n`; + } + + const config = BergamotUtils.generateTextConfig({ + "beam-size": "1", + normalize: "1.0", + "word-penalty": "0", + "max-length-break": "128", + "mini-batch-words": "1024", + workspace: "128", + "max-length-factor": "2.0", + "skip-cost": (!qualityModel).toString(), + "cpu-threads": "0", + quiet: "true", + "quiet-translation": "true", + "gemm-precision": + languageTranslationModelFiles.model.record.name.endsWith("intgemm8.bin") + ? "int8shiftAll" + : "int8shiftAlphaAll", + alignment: "soft", + }); + + log(`Bergamot translation model config: ${config}`); + log(memoryLog); + + return new bergamot.TranslationModel( + config, + model, + lex, + vocabList, + qualityModel ?? null + ); + } + + /** + * The models must be placed in aligned memory that the Bergamot wasm module has access + * to. This function copies over the model blobs into this memory space. 
+ * + * @param {Bergamot} bergamot + * @param {LanguageTranslationModelFiles} languageTranslationModelFiles + * @returns {LanguageTranslationModelFilesAligned} + */ + static allocateModelMemory(bergamot, languageTranslationModelFiles) { + /** @type {LanguageTranslationModelFilesAligned} */ + const results = {}; + + for (const [fileType, file] of Object.entries( + languageTranslationModelFiles + )) { + const alignment = MODEL_FILE_ALIGNMENTS[fileType]; + if (!alignment) { + throw new Error(`Unknown file type: "${fileType}"`); + } + + const alignedMemory = new bergamot.AlignedMemory( + file.buffer.byteLength, + alignment + ); + + alignedMemory.getByteArrayView().set(new Uint8Array(file.buffer)); + + results[fileType] = alignedMemory; + } + + return results; + } + + /** + * Initialize the Bergamot translation engine. It is a wasm compiled version of the + * Marian translation software. The wasm is delivered remotely to cut down on binary size. + * + * https://github.com/mozilla/bergamot-translator/ + * + * @param {ArrayBuffer} wasmBinary + * @returns {Promise<Bergamot>} + */ + static initializeWasm(wasmBinary) { + return new Promise((resolve, reject) => { + /** @type {number} */ + let start = performance.now(); + + /** @type {Bergamot} */ + const bergamot = loadBergamot({ + // This is the amount of memory that a simple run of Bergamot uses, in byte. + INITIAL_MEMORY: 459_276_288, + preRun: [], + onAbort() { + reject(new Error("Error loading Bergamot wasm module.")); + }, + onRuntimeInitialized: async () => { + const duration = performance.now() - start; + log( + `Bergamot wasm runtime initialized in ${duration / 1000} seconds.` + ); + // Await at least one microtask so that the captured `bergamot` variable is + // fully initialized. 
+ await Promise.resolve(); + resolve(bergamot); + }, + wasmBinary, + }); + }); + } + + /** + * Maps the Bergamot Vector to a JS array + * + * @param {Bergamot["Vector"]} vector + * @param {Function} fn + * @returns {Array} + */ + static mapVector(vector, fn) { + const result = []; + for (let index = 0; index < vector.size(); index++) { + result.push(fn(vector.get(index), index)); + } + return result; + } + + /** + * Generate a config for the Marian translation service. It requires specific whitespace. + * + * https://marian-nmt.github.io/docs/cmd/marian-decoder/ + * + * @param {Record<string, string>} config + * @returns {string} + */ + static generateTextConfig(config) { + const indent = " "; + let result = "\n"; + + for (const [key, value] of Object.entries(config)) { + result += `${indent}${key}: ${value}\n`; + } + + return result + indent; + } + + /** + * JS objects need to be translated into wasm objects to configure the translation engine. + * + * @param {Bergamot} bergamot + * @param {string[]} messageBatch + * @param {boolean} withQualityEstimation + * @returns {{ messages: Bergamot["VectorString"], options: Bergamot["VectorResponseOptions"] }} + */ + static getTranslationArgs( + bergamot, + messageBatch, + isHTML, + withQualityEstimation + ) { + const messages = new bergamot.VectorString(); + const options = new bergamot.VectorResponseOptions(); + for (let message of messageBatch) { + message = message.trim(); + // Empty paragraphs break the translation. + if (message === "") { + continue; + } + + if (withQualityEstimation && !isHTML) { + // Bergamot only supports quality estimates with HTML. Purely text content can + // be translated by escaping it as HTML. See: + // https://github.com/mozilla/firefox-translations/blob/431e0d21f22694c1cbc0ff965820d9780cdaeea8/extension/controller/translation/translationWorker.js#L146-L158 + throw new Error( + "Quality estimates on non-hTML is not curently supported." 
+ ); + } + + messages.push_back(message); + options.push_back({ + qualityScores: withQualityEstimation, + alignment: true, + html: isHTML, + }); + } + return { messages, options }; + } +} + +/** + * For testing purposes, provide a fully mocked engine. This allows for easy integration + * testing of the UI, without having to rely on downloading remote models and remote + * wasm binaries. + */ +class MockedEngine { + /** + * @param {string} fromLanguage + * @param {string} toLanguage + */ + constructor(fromLanguage, toLanguage) { + /** @type {string} */ + this.fromLanguage = fromLanguage; + /** @type {string} */ + this.toLanguage = toLanguage; + } + + /** + * Create a fake translation of the text: each message is upper-cased and suffixed + * with the language pair (and ", html" when isHTML is set). + * + * @param {string[]} messageBatch + * @param {boolean} isHTML + * @returns {string[]} One fake translation per message in the batch. + */ + translate(messageBatch, isHTML) { + return messageBatch.map(message => { + // Note when an HTML translation is requested. + let html = isHTML ? ", html" : ""; + message = message.toUpperCase(); + + return `${message} [${this.fromLanguage} to ${this.toLanguage}${html}]`; + }); + } + + /** Intentionally a no-op in the mocked engine. */ + discardTranslations() {} +} + +/** + * This class takes tasks that may block the thread's event loop, and has them yield + * after a time budget via setTimeout calls to allow other code to execute. + */ +class WorkQueue { + #TIME_BUDGET = 100; // ms + #RUN_IMMEDIATELY_COUNT = 20; + + /** @type {Array<{task: Function, resolve: Function, reject: Function}>} */ + #tasks = []; + #isRunning = false; + #isWorkCancelled = false; + #runImmediately = this.#RUN_IMMEDIATELY_COUNT; + + /** + * @param {number} innerWindowId + */ + constructor(innerWindowId) { + this.innerWindowId = innerWindowId; + } + + /** + * Run the task and return the result. + * + * @template T + * @param {() => T} task + * @returns {Promise<T>} + */ + runTask(task) { + if (this.#runImmediately > 0) { + // Run the first N translations immediately, most likely these are the user-visible + // translations on the page, as they are sent in first. 
The setTimeout of 0 can + // still delay the translations noticeably. + this.#runImmediately--; + return Promise.resolve(task()); + } + return new Promise((resolve, reject) => { + this.#tasks.push({ task, resolve, reject }); + this.#run().catch(error => console.error(error)); + }); + } + + /** + * The internal run function. + */ + async #run() { + if (this.#isRunning) { + // The work queue is already running. + return; + } + + this.#isRunning = true; + + // Measure the timeout + let lastTimeout = null; + + let tasksInBatch = 0; + const addProfilerMarker = () => { + ChromeUtils.addProfilerMarker( + "TranslationsWorker WorkQueue", + { startTime: lastTimeout, innerWindowId: this.innerWindowId }, + `WorkQueue processed ${tasksInBatch} tasks` + ); + }; + + while (this.#tasks.length !== 0) { + if (this.#isWorkCancelled) { + // The work was already cancelled. + break; + } + const now = performance.now(); + + if (lastTimeout === null) { + lastTimeout = now; + // Allow other work to get on the queue. + await new Promise(resolve => setTimeout(resolve, 0)); + } else if (now - lastTimeout > this.#TIME_BUDGET) { + // Perform a timeout with no effective wait. This clears the current + // promise queue from the event loop. + await new Promise(resolve => setTimeout(resolve, 0)); + addProfilerMarker(); + lastTimeout = performance.now(); + } + + // Check this between every `await`. + if (this.#isWorkCancelled) { + break; + } + + tasksInBatch++; + const { task, resolve, reject } = this.#tasks.shift(); + try { + const result = await task(); + + // Check this between every `await`. + if (this.#isWorkCancelled) { + break; + } + // The work is done, resolve the original task. 
+ resolve(result); + } catch (error) { + reject(error); + } + } + addProfilerMarker(); + this.isRunning = false; + } + + async cancelWork() { + this.#isWorkCancelled = true; + this.#tasks = []; + await new Promise(resolve => setTimeout(resolve, 0)); + this.#isWorkCancelled = false; + } +} diff --git a/toolkit/components/translations/content/translations.css b/toolkit/components/translations/content/translations.css new file mode 100644 index 0000000000..094ebde75d --- /dev/null +++ b/toolkit/components/translations/content/translations.css @@ -0,0 +1,174 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +:root { + /* Provide defaults for when this page is viewed in "toolkit". */ + background-color: var(--in-content-page-background, #fff); + color: var(--in-content-page-color, #15141a); + + /* Provide backup values for some of the variables used in "browser" so that the styles + look nice by default in "toolkit". */ + --AT-box-background: var(--in-content-box-background, #fff); + --AT-box-border-color: var(--in-content-box-border-color, #9e9ea0); + --AT-box-info-background: var(--in-content-box-info-background, #f0f0f4); + + /* Variables used in the page layout */ + --AT-page-margin: 20px; + --AT-input-padding: 20px; + /* This is somewhat arbitrary, but works well for the current design. If the computed + header height changes, this will need to be adjusted. */ + --AT-header-height: 156px; + --AT-input-height: calc(min(400px, calc(100vh - var(--AT-header-height)))); + --AT-select-arrow-inset: 5px; +} + +h1 { + /* Provide this style for "toolkit". 
It is defined in "browser" */ + font-weight: lighter; +} + +body { + display: flex; + justify-content: center; + align-items: center; + inset: 0; + position: absolute; + visibility: hidden; + flex-direction: column; +} + +.about-translations-header { + display: flex; +} + +.about-translations-header > * { + flex: 1; + display: flex; + max-width: 50%; +} + +.about-translations-header-start { + justify-content: start; +} + +.about-translations-header-end { + justify-content: end; +} + +/* Increase the selector specificity to override the base `select` styles. */ +select.about-translations-select { + position: relative; + padding-inline: 10px 20px; + padding-block: 0px; + min-width: 50%; + margin: 5px; + background-position: right var(--AT-select-arrow-inset) center; +} + +select.about-translations-select:dir(rtl) { + background-position-x: left var(--AT-select-arrow-inset); +} + +.about-translations-contents { + display: flex; + flex-direction: column; + box-sizing: border-box; + width: calc(100% - var(--AT-page-margin) * 2); + max-width: 1200px; + background-color: var(--AT-box-background); + border: 1px solid var(--AT-box-border-color); + border-radius: 4px; +} + +.about-translations-input { + display: flex; + width: 100%; + border-top: 1px solid var(--AT-box-border-color); +} + +.about-translations-input-start { + border-inline-end: 1px solid var(--AT-box-border-color); +} + +.about-translations-input > * { + position: relative; + width: 50%; +} + +.about-translations-input-textarea { + /* Override user's dragging of the textarea width. 
*/ + width: 100% !important; + height: var(--AT-input-height); + box-sizing: border-box; + margin: 0; + padding: var(--AT-input-padding); + border: 0; +} + +.about-translations-input-results-blank { + opacity: 0.7; +} + +.about-translations-input-results { + position: absolute; + inset: 0; + padding: var(--AT-input-padding); + box-sizing: border-box; + overflow-y: scroll; +} + +.about-translations-info { + display: none; + padding: 10px; + background-color: var(--AT-box-info-background); + border-radius: 4px; + margin-bottom: var(--AT-input-padding); +} + +.about-translations-info-message { + flex: 1; + align-self: center; +} + +.about-translations-info-icon { + width: 16px; + height: 16px; + margin: 10px; + background-image: url('chrome://global/skin/icons/info.svg'); + -moz-context-properties: fill; + fill: currentColor; +} + +@media (max-width: 700px) { + :root { + --AT-page-margin: 10px; + } + h1 { + margin-top: 15px; + } + body { + padding-bottom: var(--AT-page-margin); + } + .about-translations-input { + flex-direction: column; + flex: 1; + } + .about-translations-input-textarea, + .about-translations-input { + font-size: 16px; + } + .about-translations-input > * { + width: 100%; + flex: 1; + } + .about-translations-input-end { + border-top: 1px solid var(--AT-box-border-color); + } + .about-translations-input-textarea { + height: 100%; + } + .about-translations-contents { + flex: 1; + } +} diff --git a/toolkit/components/translations/content/translations.html b/toolkit/components/translations/content/translations.html new file mode 100644 index 0000000000..bd2c114a0a --- /dev/null +++ b/toolkit/components/translations/content/translations.html @@ -0,0 +1,70 @@ +<!-- This Source Code Form is subject to the terms of the Mozilla Public + - License, v. 2.0. If a copy of the MPL was not distributed with this + - file, You can obtain one at http://mozilla.org/MPL/2.0/. 
--> + +<!DOCTYPE html> +<html> + <head> + <meta charset="utf-8"> + <meta http-equiv="Content-Security-Policy" content="default-src chrome:; object-src 'none'"> + <meta name="color-scheme" content="light dark"> + <meta name="viewport" content="width=device-width" /> + <title data-l10n-id="about-translations-title"></title> + <link rel="stylesheet" href="chrome://global/skin/global.css"> + <link rel="stylesheet" href="chrome://global/skin/in-content/common.css"> + <link rel="stylesheet" href="chrome://global/content/translations/translations.css"> + <link rel="localization" href="toolkit/branding/brandings.ftl"/> + <link rel="localization" href="locales-preview/aboutTranslations.ftl"/> + <script type="module" src="chrome://global/content/translations/translations.mjs"></script> + </head> + <body> + <h1 data-l10n-id="about-translations-header"></h1> + <main class="about-translations-contents"> + + <header class="about-translations-header"> + <div class="about-translations-header-start"> + <select + class="about-translations-select" + id="language-from" + disabled> + <option data-l10n-id="about-translations-detect" value="detect"></option> + </select> + </div> + <div class="about-translations-header-end"> + <select + class="about-translations-select" + id="language-to" + disabled> + <option data-l10n-id="about-translations-select" value=""></option> + </select> + </div> + </header> + + <main class="about-translations-input"> + <div class="about-translations-input-start"> + <textarea + class="about-translations-input-textarea" + data-l10n-id="about-translations-textarea" + id="translation-from" + ></textarea> + </div> + <div class="about-translations-input-end"> + <div + class="about-translations-input-results about-translations-input-results-blank" + id="translation-to-blank"> + <div class="about-translations-info" id="translation-info"> + <div class="about-translations-info-icon"></div> + <div class="about-translations-info-message" 
id="translation-info-message"></div> + </div> + <div data-l10n-id="about-translations-results-placeholder"></div> + </div> + <div + class="about-translations-input-results" + id="translation-to"> + </div> + </div> + </main> + + </main> + </body> +</html> diff --git a/toolkit/components/translations/content/translations.mjs b/toolkit/components/translations/content/translations.mjs new file mode 100644 index 0000000000..b279b03b8b --- /dev/null +++ b/toolkit/components/translations/content/translations.mjs @@ -0,0 +1,690 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// The following globals are injected via the AboutTranslationsChild actor. +// translations.mjs is running in an unprivileged context, and these injected functions +// allow for the page to get access to additional privileged features. + +/* global AT_getSupportedLanguages, AT_log, AT_getScriptDirection, + AT_logError, AT_destroyTranslationsEngine, AT_createTranslationsEngine, + AT_isTranslationEngineSupported, AT_createLanguageIdEngine, AT_translate, AT_identifyLanguage */ + +// Allow tests to override this value so that they can run faster. +// This is the delay in milliseconds. +window.DEBOUNCE_DELAY = 200; +// Allow tests to test the debounce behavior by counting debounce runs. +window.DEBOUNCE_RUN_COUNT = 0; + +/** + * @typedef {import("../translations").SupportedLanguages} SupportedLanguages + */ + +/** + * The model and controller for initializing about:translations. + */ +class TranslationsState { + /** + * This class is responsible for all UI updates. + * + * @type {TranslationsUI} + */ + ui; + + /** + * The language to translate from, in the form of a BCP 47 language tag, + * e.g. "en" or "fr". + * + * @type {string} + */ + fromLanguage = ""; + + /** + * The language to translate to, in the form of a BCP 47 language tag, + * e.g. 
"en" or "fr". + * + * @type {string} + */ + toLanguage = ""; + + /** + * The message to translate, cached so that it can be determined if the text + * needs to be re-translated. + * + * @type {string} + */ + messageToTranslate = ""; + + /** + * Only send one translation in at a time to the worker. + * @type {Promise<string[]>} + */ + translationRequest = Promise.resolve([]); + + /** + * The translations engine is only valid for a single language pair, and needs + * to be recreated if the language pair changes. + * + * @type {null | Promise<TranslationsEngine>} + */ + translationsEngine = null; + + /** + * @param {boolean} isSupported + */ + constructor(isSupported) { + /** + * Is the engine supported by the device? + * @type {boolean} + */ + this.isTranslationEngineSupported = isSupported; + + /** + * Allow code to wait for the engine to be created. + * @type {Promise<void>} + */ + this.languageIdEngineCreated = isSupported + ? AT_createLanguageIdEngine() + : Promise.resolve(); + + /** + * @type {SupportedLanguages} + */ + this.supportedLanguages = isSupported + ? AT_getSupportedLanguages() + : Promise.resolve([]); + + this.ui = new TranslationsUI(this); + this.ui.setup(); + + // Set the UI as ready after all of the state promises have settled. + Promise.allSettled([ + this.languageIdEngineCreated, + this.supportedLanguages, + ]).then(() => { + this.ui.setAsReady(); + }); + } + + /** + * Identifies the human language in which the message is written and returns + * the BCP 47 language tag of the language it is determined to be. + * + * e.g. "en" for English. 
+ * + * @param {string} message + */ + async identifyLanguage(message) { + await this.languageIdEngineCreated; + const start = performance.now(); + const { langTag, confidence } = await AT_identifyLanguage(message); + const duration = performance.now() - start; + AT_log( + `[ ${langTag}(${(confidence * 100).toFixed(2)}%) ]`, + `Source language identified in ${duration / 1000} seconds` + ); + return langTag; + } + + /** + * Only request a translation when it's ready. + */ + maybeRequestTranslation = debounce({ + /** + * Debounce the translation requests so that the worker doesn't fire for every + * single keyboard input, but instead the keyboard events are ignored until + * there is a short break, or enough events have happened that it's worth sending + * in a new translation request. + */ + onDebounce: async () => { + // The contents of "this" can change between async steps, store a local variable + // binding of these values. + const { + fromLanguage, + toLanguage, + messageToTranslate, + translationsEngine, + } = this; + + if (!this.isTranslationEngineSupported) { + // Never translate when the engine isn't supported. + return; + } + + if ( + !fromLanguage || + !toLanguage || + !messageToTranslate || + !translationsEngine + ) { + // Not everything is set for translation. + this.ui.updateTranslation(""); + return; + } + + await Promise.all([ + // Ensure the engine is ready to go. + translationsEngine, + // Ensure the previous translation has finished so that only the latest + // translation goes through. + this.translationRequest, + ]); + + if ( + // Check if the current configuration has changed and if this is stale. If so + // then skip this request, as there is already a newer request with more up to + // date information. 
+ this.translationsEngine !== translationsEngine || + this.fromLanguage !== fromLanguage || + this.toLanguage !== toLanguage || + this.messageToTranslate !== messageToTranslate + ) { + return; + } + + const start = performance.now(); + + this.translationRequest = AT_translate([messageToTranslate]); + const [translation] = await this.translationRequest; + + // The measure events will show up in the Firefox Profiler. + performance.measure( + `Translations: Translate "${this.fromLanguage}" to "${this.toLanguage}" with ${messageToTranslate.length} characters.`, + { + start, + end: performance.now(), + } + ); + + this.ui.updateTranslation(translation); + const duration = performance.now() - start; + AT_log(`Translation done in ${duration / 1000} seconds`); + }, + + // Mark the events so that they show up in the Firefox Profiler. This makes it handy + // to visualize the debouncing behavior. + doEveryTime: () => { + performance.mark( + `Translations: input changed to ${this.messageToTranslate.length} characters` + ); + }, + }); + + /** + * Any time a language pair is changed, the TranslationsEngine needs to be rebuilt. + */ + async maybeRebuildWorker() { + // If we may need to re-building the worker, the old translation is no longer valid. + this.ui.updateTranslation(""); + + // These are cases in which it wouldn't make sense or be possible to load any translations models. + if ( + // If fromLanguage or toLanguage are unpopulated we cannot load anything. + !this.fromLanguage || + !this.toLanguage || + // If fromLanguage's value is "detect", rather than a BCP 47 language tag, then no language + // has been detected yet. + this.fromLanguage === "detect" || + // If fromLanguage and toLanguage are the same, this means that the detected language + // is the same as the toLanguage, and we do not want to translate from one language to itself. + this.fromLanguage === this.toLanguage + ) { + if (this.translationsEngine) { + // The engine is no longer needed. 
+ AT_destroyTranslationsEngine(); + this.translationsEngine = null; + } + return; + } + + const start = performance.now(); + AT_log( + `Rebuilding the translations worker for "${this.fromLanguage}" to "${this.toLanguage}"` + ); + + this.translationsEngine = AT_createTranslationsEngine( + this.fromLanguage, + this.toLanguage + ); + this.maybeRequestTranslation(); + + try { + await this.translationsEngine; + const duration = performance.now() - start; + AT_log(`Rebuilt the TranslationsEngine in ${duration / 1000} seconds`); + } catch (error) { + this.ui.showInfo("about-translations-engine-error"); + AT_logError("Failed to get the Translations worker", error); + } + } + + /** + * Updates the fromLanguage to match the detected language only if the + * about-translations-detect option is selected in the language-from dropdown. + * + * If the new fromLanguage is different than the previous fromLanguage this + * may update the UI to display the new language and may rebuild the translations + * worker if there is a valid selected target language. + */ + async maybeUpdateDetectedLanguage() { + if (!this.ui.detectOptionIsSelected() || this.messageToTranslate === "") { + // If we are not detecting languages or if the message has been cleared + // we should ensure that the UI is not displaying a detected language + // and there is no need to run any language detection. + this.ui.setDetectOptionTextContent(""); + return; + } + + const [langTag, supportedLanguages] = await Promise.all([ + this.identifyLanguage(this.messageToTranslate), + this.supportedLanguages, + ]); + + // Only update the language if the detected language matches + // one of our supported languages. 
+ const entry = supportedLanguages.fromLanguages.find( + ({ langTag: existingTag }) => existingTag === langTag + ); + if (entry) { + const { displayName, isBeta } = entry; + await this.setFromLanguage(langTag); + this.ui.setDetectOptionTextContent(displayName, isBeta); + } + } + + /** + * @param {string} lang + */ + async setFromLanguage(lang) { + if (lang !== this.fromLanguage) { + this.fromLanguage = lang; + await this.maybeRebuildWorker(); + } + } + + /** + * @param {string} lang + */ + setToLanguage(lang) { + if (lang !== this.toLanguage) { + this.toLanguage = lang; + this.maybeRebuildWorker(); + } + } + + /** + * @param {string} message + */ + async setMessageToTranslate(message) { + if (message !== this.messageToTranslate) { + this.messageToTranslate = message; + await this.maybeUpdateDetectedLanguage(); + this.maybeRequestTranslation(); + } + } +} + +/** + * + */ +class TranslationsUI { + /** @type {HTMLSelectElement} */ + languageFrom = document.getElementById("language-from"); + /** @type {HTMLSelectElement} */ + languageTo = document.getElementById("language-to"); + /** @type {HTMLTextAreaElement} */ + translationFrom = document.getElementById("translation-from"); + /** @type {HTMLDivElement} */ + translationTo = document.getElementById("translation-to"); + /** @type {HTMLDivElement} */ + translationToBlank = document.getElementById("translation-to-blank"); + /** @type {HTMLDivElement} */ + translationInfo = document.getElementById("translation-info"); + /** @type {HTMLDivElement} */ + translationInfoMessage = document.getElementById("translation-info-message"); + /** @type {TranslationsState} */ + state; + + /** + * The detect-language option element. We want to maintain a handle to this so that + * we can dynamically update its display text to include the detected language. 
+ * + * @type {HTMLOptionElement} + */ + #detectOption; + + /** + * @param {TranslationsState} state + */ + constructor(state) { + this.state = state; + this.translationTo.style.visibility = "visible"; + this.#detectOption = document.querySelector('option[value="detect"]'); + } + + /** + * Do the initial setup. + */ + setup() { + if (!this.state.isTranslationEngineSupported) { + this.showInfo("about-translations-no-support"); + this.disableUI(); + return; + } + this.setupDropdowns(); + this.setupTextarea(); + } + + /** + * Signals that the UI is ready, for tests. + */ + setAsReady() { + document.body.setAttribute("ready", ""); + } + + /** + * Once the models have been synced from remote settings, populate them with the display + * names of the languages. + */ + async setupDropdowns() { + const supportedLanguages = await this.state.supportedLanguages; + + // Update the DOM elements with the display names. + for (const { + langTag, + isBeta, + displayName, + } of supportedLanguages.toLanguages) { + const option = document.createElement("option"); + option.value = langTag; + if (isBeta) { + document.l10n.setAttributes( + option, + "about-translations-displayname-beta", + { language: displayName } + ); + } else { + option.text = displayName; + } + this.languageTo.add(option); + } + + for (const { + langTag, + isBeta, + displayName, + } of supportedLanguages.fromLanguages) { + const option = document.createElement("option"); + option.value = langTag; + if (isBeta) { + document.l10n.setAttributes( + option, + "about-translations-displayname-beta", + { language: displayName } + ); + } else { + option.text = displayName; + } + this.languageFrom.add(option); + } + + // Enable the controls. + this.languageFrom.disabled = false; + this.languageTo.disabled = false; + + // Focus the language dropdowns if they are empty. 
+ if (this.languageFrom.value == "") { + this.languageFrom.focus(); + } else if (this.languageTo.value == "") { + this.languageTo.focus(); + } + + this.state.setFromLanguage(this.languageFrom.value); + this.state.setToLanguage(this.languageTo.value); + this.updateOnLanguageChange(); + + this.languageFrom.addEventListener("input", () => { + this.state.setFromLanguage(this.languageFrom.value); + this.updateOnLanguageChange(); + }); + + this.languageTo.addEventListener("input", () => { + this.state.setToLanguage(this.languageTo.value); + this.updateOnLanguageChange(); + this.translationTo.setAttribute("lang", this.languageTo.value); + }); + } + + /** + * Show an info message to the user. + * + * @param {string} l10nId + */ + showInfo(l10nId) { + this.translationInfoMessage.setAttribute("data-l10n-id", l10nId); + this.translationInfo.style.display = "flex"; + } + + /** + * Hides the info UI. + */ + hideInfo() { + this.translationInfo.style.display = "none"; + } + + /** + * Returns true if about-translations-detect is the currently + * selected option in the language-from dropdown, otherwise false. + * + * @returns {boolean} + */ + detectOptionIsSelected() { + return this.languageFrom.value === "detect"; + } + + /** + * Sets the textContent of the about-translations-detect option in the + * language-from dropdown to include the detected language's display name. + * + * @param {string} displayName + */ + setDetectOptionTextContent(displayName, isBeta = false) { + // Set the text to the fluent value that takes an arg to display the language name. + if (displayName) { + document.l10n.setAttributes( + this.#detectOption, + isBeta + ? "about-translations-detect-lang-beta" + : "about-translations-detect-lang", + { language: displayName } + ); + } else { + // Reset the text to the fluent value that does not display any language name. + document.l10n.setAttributes( + this.#detectOption, + "about-translations-detect" + ); + } + } + + /** + * React to language changes. 
+ */ + updateOnLanguageChange() { + this.#updateDropdownLanguages(); + this.#updateMessageDirections(); + } + + /** + * You cant translate from one language to another language. Hide the options + * if this is the case. + */ + #updateDropdownLanguages() { + for (const option of this.languageFrom.options) { + option.hidden = false; + } + for (const option of this.languageTo.options) { + option.hidden = false; + } + if (this.state.toLanguage) { + const option = this.languageFrom.querySelector( + `[value=${this.state.toLanguage}]` + ); + if (option) { + option.hidden = true; + } + } + if (this.state.fromLanguage) { + const option = this.languageTo.querySelector( + `[value=${this.state.fromLanguage}]` + ); + if (option) { + option.hidden = true; + } + } + this.state.maybeUpdateDetectedLanguage(); + } + + /** + * Define the direction of the language message text, otherwise it might not display + * correctly. For instance English in an RTL UI would display incorrectly like so: + * + * LTR text in LTR UI: + * + * ┌──────────────────────────────────────────────┐ + * │ This is in English. │ + * └──────────────────────────────────────────────┘ + * + * LTR text in RTL UI: + * ┌──────────────────────────────────────────────┐ + * │ .This is in English │ + * └──────────────────────────────────────────────┘ + * + * LTR text in RTL UI, but in an LTR container: + * ┌──────────────────────────────────────────────┐ + * │ This is in English. │ + * └──────────────────────────────────────────────┘ + * + * The effects are similar, but reversed for RTL text in an LTR UI. 
+ */ + #updateMessageDirections() { + if (this.state.toLanguage) { + this.translationTo.setAttribute( + "dir", + AT_getScriptDirection(this.state.toLanguage) + ); + } else { + this.translationTo.removeAttribute("dir"); + } + if (this.state.fromLanguage) { + this.translationFrom.setAttribute( + "dir", + AT_getScriptDirection(this.state.fromLanguage) + ); + } else { + this.translationFrom.removeAttribute("dir"); + } + } + + setupTextarea() { + this.state.setMessageToTranslate(this.translationFrom.value); + this.translationFrom.addEventListener("input", () => { + this.state.setMessageToTranslate(this.translationFrom.value); + }); + } + + disableUI() { + this.translationFrom.disabled = true; + this.languageFrom.disabled = true; + this.languageTo.disabled = true; + } + + /** + * @param {string} message + */ + updateTranslation(message) { + this.translationTo.innerText = message; + if (message) { + this.translationTo.style.visibility = "visible"; + this.translationToBlank.style.visibility = "hidden"; + this.hideInfo(); + } else { + this.translationTo.style.visibility = "hidden"; + this.translationToBlank.style.visibility = "visible"; + } + } +} + +/** + * Listen for events coming from the AboutTranslations actor. + */ +window.addEventListener("AboutTranslationsChromeToContent", ({ detail }) => { + switch (detail.type) { + case "enable": { + // While the feature is in development, hide the feature behind a pref. See the + // "browser.translations.enable" pref in modules/libpref/init/all.js and Bug 971044 + // for the status of enabling this project. 
+ if (window.translationsState) { + throw new Error("about:translations was already initialized."); + } + AT_isTranslationEngineSupported().then(isSupported => { + window.translationsState = new TranslationsState(isSupported); + }); + document.body.style.visibility = "visible"; + break; + } + default: + throw new Error("Unknown AboutTranslationsChromeToContent event."); + } +}); + +/** + * Debounce a function so that it is only called after some wait time with no activity. + * This is good for grouping text entry via keyboard. + * + * @param {Object} settings + * @param {Function} settings.onDebounce + * @param {Function} settings.doEveryTime + * @returns {Function} + */ +function debounce({ onDebounce, doEveryTime }) { + /** @type {number | null} */ + let timeoutId = null; + let lastDispatch = null; + + return (...args) => { + doEveryTime(...args); + + const now = Date.now(); + if (lastDispatch === null) { + // This is the first call to the function. + lastDispatch = now; + } + + const timeLeft = lastDispatch + window.DEBOUNCE_DELAY - now; + + // Always discard the old timeout, either the function will run, or a new + // timer will be scheduled. + clearTimeout(timeoutId); + + if (timeLeft <= 0) { + // It's been long enough to go ahead and call the function. + timeoutId = null; + lastDispatch = null; + window.DEBOUNCE_RUN_COUNT += 1; + onDebounce(...args); + return; + } + + // Re-set the timeout with the current time left. + clearTimeout(timeoutId); + + timeoutId = setTimeout(() => { + // Timeout ended, call the function. + timeoutId = null; + lastDispatch = null; + window.DEBOUNCE_RUN_COUNT += 1; + onDebounce(...args); + }, timeLeft); + }; +} |