Adding upstream version 115.7.0esr.upstream/115.7.0esr

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 19:33:14 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 19:33:14 +0000
commit: 36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree: 105e8c98ddea1c1e4784a60a5a6410fa416be2de /toolkit/components/translations/content/translations-document.sys.mjs
parent: Initial commit. (diff)
download: firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz
firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip
1 files changed, 1284 insertions, 0 deletions
diff --git a/toolkit/components/translations/content/translations-document.sys.mjs b/toolkit/components/translations/content/translations-document.sys.mjs
new file mode 100644
index 0000000000..c1c883dbc8
--- /dev/null
+++ b/toolkit/components/translations/content/translations-document.sys.mjs
@@ -0,0 +1,1284 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
+
+const lazy = {};
+
+// Expose `setTimeout` from Timer.sys.mjs as a lazily resolved property on `lazy`.
+ChromeUtils.defineESModuleGetters(lazy, {
+  setTimeout: "resource://gre/modules/Timer.sys.mjs",
+});
+
+// A console instance prefixed with "Translations". Its maximum log level is read from
+// the "browser.translations.logLevel" preference.
+XPCOMUtils.defineLazyGetter(lazy, "console", () =>
+  console.createInstance({
+    maxLogLevelPref: "browser.translations.logLevel",
+    prefix: "Translations",
+  })
+);
+
+/**
+ * Map the NodeFilter enums that are used by the TreeWalker into enums that make
+ * sense for determining the status of the nodes for the TranslationsDocument process.
+ * This aligns the meanings of the filtering for the translations process.
+ *
+ * The object is frozen because it is a shared lookup table: the values are returned
+ * directly to the TreeWalker as filter results, so they must never be reassigned.
+ */
+const NodeStatus = Object.freeze({
+  // This node is ready to translate as is.
+  READY_TO_TRANSLATE: NodeFilter.FILTER_ACCEPT,
+
+  // This node contains too many block elements and needs to be subdivided further.
+  SUBDIVIDE_FURTHER: NodeFilter.FILTER_SKIP,
+
+  // This node should not be considered for translation.
+  NOT_TRANSLATABLE: NodeFilter.FILTER_REJECT,
+});
+
+/**
+ * @typedef {import("../translations").NodeVisibility} NodeVisibility
+ * @typedef {(message: string) => Promise<string>} TranslationFunction
+ */
+
+/**
+ * How often the DOM is updated with translations, in milliseconds. This is the delay
+ * passed to `lazy.setTimeout` when a batched DOM update is scheduled while other
+ * translations are still pending.
+ */
+const DOM_UPDATE_INTERVAL_MS = 50;
+
+/**
+ * These tags are excluded from translation.
+ *
+ * The names are stored in upper case, which is how `nodeName` is reported for HTML
+ * elements in an HTML document.
+ *
+ * NOTE(review): elements outside of the HTML namespace (for instance <svg> and <math>)
+ * report a lower-case `nodeName`, so lookups against this set need to normalize the
+ * case of the name being tested.
+ */
+const EXCLUDED_TAGS = new Set([
+  // The following are elements that semantically should not be translated.
+  "CODE",
+  "KBD",
+  "SAMP",
+  "VAR",
+  "ACRONYM",
+
+  // The following are deprecated tags.
+  "DIR",
+  "APPLET",
+
+  // The following are embedded elements, and are not supported (yet).
+  "SVG",
+  "MATH",
+  "EMBED",
+  "OBJECT",
+  "IFRAME",
+
+  // These are elements that are treated as opaque by Firefox which causes their
+  // innerHTML property to be just the raw text node behind it. Any text that is sent as
+  // HTML must be valid, and there is no guarantee that the innerHTML is valid.
+  "NOSCRIPT",
+  "NOEMBED",
+  "NOFRAMES",
+
+  // The title is handled separately, and a HEAD tag should not be considered.
+  "HEAD",
+
+  // These are not user-visible tags.
+  "STYLE",
+  "SCRIPT",
+  "TEMPLATE",
+
+  // Textarea elements contain user content, which should not be translated.
+  "TEXTAREA",
+]);
+
+// Tags that are treated as assumed inline. This list has been created by heuristics
+// and excludes some commonly inline tags, due to how they are used practically.
+//
+// An actual list of inline elements is available here:
+// https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements#list_of_inline_elements
+//
+// Note that a few names (CODE, KBD, MATH, VAR) are also present in EXCLUDED_TAGS.
+// NOTE(review): this set is not read in this part of the file; presumably it is
+// consumed by the subdivision heuristics defined further down - confirm.
+const INLINE_TAGS = new Set([
+  "ABBR",
+  "B",
+  "CODE",
+  "DEL",
+  "EM",
+  "I",
+  "INS",
+  "KBD",
+  "MARK",
+  "MATH",
+  "OUTPUT",
+  "Q",
+  "RUBY",
+  "SMALL",
+  "STRONG",
+  "SUB",
+  "SUP",
+  "TIME",
+  "U",
+  "VAR",
+  "WBR",
+
+  // These are not really inline, but bergamot-translator treats these as
+  // sentence-breaking.
+  "BR",
+  "TD",
+  "TH",
+  "LI",
+]);
+
+/**
+ * Tags that can't reliably be assumed to be inline or block elements. They default
+ * to inline, but are often used as block elements.
+ *
+ * NOTE(review): this set is not read in this part of the file; presumably it is
+ * consumed by the subdivision heuristics defined further down - confirm.
+ */
+const GENERIC_TAGS = new Set(["A", "SPAN"]);
+
+/**
+ * This class manages the process of translating the DOM from one language to another.
+ * A translateHTML and a translateText function are injected into the constructor. This
+ * class is responsible for subdividing a Node into small enough pieces to where it
+ * contains a reasonable amount of text and inline elements for the translations engine
+ * to translate. Once a node has been identified as a small enough chunk, its innerHTML
+ * is read, and sent for translation. The async translation result comes back as an HTML
+ * string. The DOM node is updated with the new text and potentially changed DOM ordering.
+ *
+ * This class also handles mutations of the DOM and will translate nodes as they are added
+ * to the page, or when a node's text is changed by content scripts.
+ */
+export class TranslationsDocument {
+  /**
+   * The BCP 47 language tag that is used on the page.
+   *
+   * @type {string}
+   */
+  documentLanguage;
+
+  /**
+   * The timeout between the first translation received and the call to update the DOM
+   * with translations. It is `null` when no update is scheduled.
+   */
+  #updateTimeout = null;
+
+  /**
+   * The nodes that need translations. They are queued when the document tree is walked,
+   * and then they are dispatched for translation based on their visibility. The viewport
+   * nodes are given the highest priority.
+   *
+   * @type {Map<Node, NodeVisibility>}
+   */
+  #queuedNodes = new Map();
+
+  /**
+   * The count of how many pending translations have been sent to the translations
+   * engine.
+   */
+  #pendingTranslationsCount = 0;
+
+  /**
+   * The list of nodes that need updating with the translated HTML. These are batched
+   * into an update.
+   *
+   * @type {Set<{ node: Node, translatedHTML: string }>}
+   */
+  #nodesWithTranslatedHTML = new Set();
+
+  /**
+   * The set of nodes that have been subdivided and processed for translation. They
+   * should not be submitted again unless their contents have been changed.
+   *
+   * @type {WeakSet<Node>}
+   */
+  #processedNodes = new WeakSet();
+
+  /**
+   * All root elements we're trying to translate. This should be the `document.body`
+   * and the `title` element.
+   *
+   * @type {Set<Node>}
+   */
+  #rootNodes = new Set();
+
+  /**
+   * This promise gets resolved when the initial viewport translations are done.
+   * This is a key user-visible performance metric. It represents what the user
+   * actually sees.
+   *
+   * @type {Promise<void> | null}
+   */
+  viewportTranslated = null;
+
+  /**
+   * Construct a new TranslationsDocument. It is tied to a specific Document and cannot
+   * be re-used. The translation functions are injected since this class shouldn't
+   * manage the life cycle of the translations engines.
+   *
+   * @param {Document} document
+   * @param {string} documentLanguage - The BCP 47 language tag.
+   * @param {number} innerWindowId - This is used for better profiler marker reporting.
+   * @param {TranslationFunction} translateHTML
+   * @param {TranslationFunction} translateText
+   * @throws {Error} When `documentLanguage` is not exactly two characters long.
+   */
+  constructor(
+    document,
+    documentLanguage,
+    innerWindowId,
+    translateHTML,
+    translateText
+  ) {
+    /**
+     * The language of the document. If elements are found that do not match this language,
+     * then they are skipped.
+     *
+     * @type {string}
+     */
+    this.documentLanguage = documentLanguage;
+    if (documentLanguage.length !== 2) {
+      throw new Error(
+        "Expected the language to be a valid 2 letter BCP 47 language tag: " +
+          documentLanguage
+      );
+    }
+
+    /** @type {TranslationFunction} */
+    this.translateHTML = translateHTML;
+
+    /** @type {TranslationFunction} */
+    this.translateText = translateText;
+
+    /** @type {number} */
+    this.innerWindowId = innerWindowId;
+
+    /** @type {DOMParser} */
+    this.domParser = new document.ownerGlobal.DOMParser();
+
+    /**
+     * This selector runs to find child nodes that should be excluded. It should be
+     * basically the same implementation of `isExcludedNode`, but as a selector.
+     *
+     * @type {string}
+     */
+    this.excludedNodeSelector = [
+      // Use: [lang|=value] to match language codes.
+      //
+      // Per: https://developer.mozilla.org/en-US/docs/Web/CSS/Attribute_selectors
+      //
+      // The elements with an attribute name of attr whose value can be exactly
+      // value or can begin with value immediately followed by a hyphen, - (U+002D).
+      // It is often used for language subcode matches.
+      `[lang]:not([lang|="${this.documentLanguage}"])`,
+      `[translate=no]`,
+      `.notranslate`,
+      `[contenteditable="true"]`,
+      `[contenteditable=""]`,
+      // Type selectors are written in lower case. They still match HTML elements, as
+      // those are matched case-insensitively in HTML documents, and they additionally
+      // match elements such as <svg> and <math>, whose names are compared
+      // case-sensitively and are lower case.
+      [...EXCLUDED_TAGS].map(tag => tag.toLowerCase()).join(","),
+    ].join(",");
+
+    this.observer = new document.ownerGlobal.MutationObserver(mutationsList => {
+      for (const mutation of mutationsList) {
+        switch (mutation.type) {
+          case "childList":
+            for (const node of mutation.addedNodes) {
+              this.#processedNodes.delete(node);
+              this.subdivideNodeForTranslations(node);
+            }
+            break;
+          case "characterData":
+            // The processed set contains nodes, not mutation records. Forget the node
+            // whose data changed, so that it is considered for translation again.
+            this.#processedNodes.delete(mutation.target);
+            this.subdivideNodeForTranslations(mutation.target);
+            break;
+          default:
+            break;
+        }
+      }
+    });
+  }
+
+  /**
+   * Add a new element to start translating. This root is tracked for mutations and
+   * kept up to date with translations. This will be the body element and title tag
+   * for the document.
+   *
+   * Nothing happens when the node is missing, is not an element, or has already been
+   * added as a root.
+   *
+   * @param {Element} [node]
+   */
+  addRootElement(node) {
+    if (!node) {
+      return;
+    }
+
+    if (node.nodeType !== Node.ELEMENT_NODE) {
+      // This node is not an element, do not add it.
+      return;
+    }
+
+    if (this.#rootNodes.has(node)) {
+      // Exclude nodes that are already targeted.
+      return;
+    }
+
+    this.#rootNodes.add(node);
+
+    this.subdivideNodeForTranslations(node);
+
+    this.observer.observe(node, {
+      characterData: true,
+      childList: true,
+      subtree: true,
+    });
+  }
+
+  /**
+   * Start walking down through a node's subtree and decide which nodes to queue for
+   * translation. This first node could be the root nodes of the DOM, such as the
+   * document body, or the title element, or it could be a mutation target.
+   *
+   * The nodes go through a process of subdivision until an appropriate sized chunk
+   * of inline text can be found.
+   *
+   * @param {Node} node
+   */
+  subdivideNodeForTranslations(node) {
+    if (!this.#rootNodes.has(node)) {
+      // This is a non-root node, which means it came from a mutation observer.
+      // Ensure that it is a valid node to translate by checking all of its ancestors.
+      for (let parent of getAncestorsIterator(node)) {
+        if (
+          this.determineTranslationStatus(parent) ===
+          NodeStatus.NOT_TRANSLATABLE
+        ) {
+          return;
+        }
+      }
+    }
+
+    switch (this.determineTranslationStatusForUnprocessedNodes(node)) {
+      case NodeStatus.NOT_TRANSLATABLE:
+        // This node is rejected as it shouldn't be translated.
+        return;
+
+      case NodeStatus.READY_TO_TRANSLATE:
+        // This node is ready for translating, and doesn't need to be subdivided. There
+        // is no reason to run the TreeWalker, it can be directly submitted for
+        // translation.
+        this.queueNodeForTranslation(node);
+        break;
+
+      case NodeStatus.SUBDIVIDE_FURTHER:
+        // This node may be translatable, but it needs to be subdivided into smaller
+        // pieces. Create a TreeWalker to walk the subtree, and find the subtrees/nodes
+        // that contain enough inline elements to send to be translated.
+        {
+          const nodeIterator = node.ownerDocument.createTreeWalker(
+            node,
+            NodeFilter.SHOW_ELEMENT,
+            this.determineTranslationStatusForUnprocessedNodes
+          );
+
+          // This iterator will contain each node that has been subdivided enough to
+          // be translated.
+          let currentNode;
+          while ((currentNode = nodeIterator.nextNode())) {
+            this.queueNodeForTranslation(currentNode);
+          }
+        }
+        break;
+    }
+
+    if (node.nodeName === "BODY") {
+      this.reportWordsInViewport();
+    }
+    this.dispatchQueuedTranslations();
+  }
+
+  /**
+   * Test whether this is an element we do not want to translate. These are things like
+   * <code> elements, elements with a different "lang" attribute, and elements that
+   * have a `translate=no` attribute.
+   *
+   * @param {Node} node
+   * @returns {boolean} True when the node must not be translated.
+   */
+  isExcludedNode(node) {
+    // Property access can be expensive, so destructure required properties so they are
+    // not accessed multiple times.
+    const { nodeType } = node;
+
+    if (nodeType === Node.TEXT_NODE) {
+      // Text nodes are never excluded.
+      return false;
+    }
+    if (nodeType !== Node.ELEMENT_NODE) {
+      // Only elements and text nodes should be considered.
+      return true;
+    }
+
+    const { nodeName } = node;
+
+    // The excluded tags are stored in upper case, but only HTML elements in HTML
+    // documents report an upper-case nodeName. Elements such as <svg> and <math>
+    // report a lower-case name, so normalize the case before the lookup.
+    if (EXCLUDED_TAGS.has(nodeName.toUpperCase())) {
+      // This is an excluded tag.
+      return true;
+    }
+
+    if (!this.matchesDocumentLanguage(node)) {
+      // Exclude nodes that don't match the fromLanguage.
+      return true;
+    }
+
+    if (node.getAttribute("translate") === "no") {
+      // This element has a translate="no" attribute.
+      // https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/translate
+      return true;
+    }
+
+    if (node.classList.contains("notranslate")) {
+      // Google Translate skips translations if the classList contains "notranslate"
+      // https://cloud.google.com/translate/troubleshooting
+      return true;
+    }
+
+    if (node.isContentEditable) {
+      // This field is editable, and so exclude it similar to the way that form input
+      // fields are excluded.
+      return true;
+    }
+
+    return false;
+  }
+
+  /**
+   * Runs `determineTranslationStatus`, but only on unprocessed nodes.
+   *
+   * This is an arrow function stored in a field so that it stays bound to this
+   * instance when it is handed to the TreeWalker as its filter.
+   *
+   * @param {Node} node
+   * @return {number} - One of the NodeStatus values.
+   */
+  determineTranslationStatusForUnprocessedNodes = node => {
+    if (this.#processedNodes.has(node)) {
+      // Skip nodes that have already been processed.
+      return NodeStatus.NOT_TRANSLATABLE;
+    }
+
+    return this.determineTranslationStatus(node);
+  };
+
+  /**
+   * Determines if a node should be submitted for translation, not translatable, or if
+   * it should be subdivided further. It doesn't check if the node has already been
+   * processed.
+   *
+   * The return result works as a TreeWalker NodeFilter as well.
+   *
+   * @param {Node} node
+   * @returns {number} - One of the `NodeStatus` values. See that object
+   *   for documentation. These values match the filters for the TreeWalker.
+   *   These values also work as a `NodeFilter` value.
+   */
+  determineTranslationStatus(node) {
+    if (isNodeQueued(node, this.#queuedNodes)) {
+      // This node or its parent was already queued, reject it.
+      return NodeStatus.NOT_TRANSLATABLE;
+    }
+
+    if (this.isExcludedNode(node)) {
+      // This is an explicitly excluded node.
+      return NodeStatus.NOT_TRANSLATABLE;
+    }
+
+    if (node.textContent.trim().length === 0) {
+      // Do not use subtrees that are empty of text. This textContent call is fairly
+      // expensive.
+      return NodeStatus.NOT_TRANSLATABLE;
+    }
+
+    if (nodeNeedsSubdividing(node)) {
+      // Skip this node, and dig deeper into its tree to cut off smaller pieces
+      // to translate. It is presumed to be a wrapper of block elements.
+      return NodeStatus.SUBDIVIDE_FURTHER;
+    }
+
+    if (
+      containsExcludedNode(node, this.excludedNodeSelector) &&
+      !hasTextNodes(node)
+    ) {
+      // Skip this node, and dig deeper into its tree to cut off smaller pieces
+      // to translate.
+      return NodeStatus.SUBDIVIDE_FURTHER;
+    }
+
+    // This node can be treated as entire block to submit for translation.
+    return NodeStatus.READY_TO_TRANSLATE;
+  }
+
+  /**
+   * Queue a node for translation, recording whether it is hidden, in the viewport, or
+   * out of the viewport at the time of queuing.
+   *
+   * @param {Node} node
+   */
+  queueNodeForTranslation(node) {
+    /** @type {NodeVisibility} */
+    let visibility = "out-of-viewport";
+    if (isNodeHidden(node)) {
+      visibility = "hidden";
+    } else if (isNodeInViewport(node)) {
+      visibility = "in-viewport";
+    }
+
+    this.#queuedNodes.set(node, visibility);
+  }
+
+  /**
+   * Submit the translations giving priority to nodes in the viewport. The queue is
+   * walked once per visibility so that the submission order is: in the viewport, out
+   * of the viewport, and finally hidden. The queue is emptied afterwards.
+   *
+   * The first time this runs, `viewportTranslated` is set to a promise that settles
+   * once all of the in-viewport translations submitted by that run have settled.
+   */
+  async dispatchQueuedTranslations() {
+    let inViewportCounts = 0;
+    let outOfViewportCounts = 0;
+    let hiddenCounts = 0;
+
+    // Only collect the promises for the initial viewport translation.
+    let inViewportTranslations;
+    if (!this.viewportTranslated) {
+      inViewportTranslations = [];
+    }
+
+    for (const [node, visibility] of this.#queuedNodes) {
+      if (visibility === "in-viewport") {
+        inViewportCounts++;
+        const promise = this.submitTranslation(node);
+        if (inViewportTranslations) {
+          inViewportTranslations.push(promise);
+        }
+      }
+    }
+    for (const [node, visibility] of this.#queuedNodes) {
+      if (visibility === "out-of-viewport") {
+        outOfViewportCounts++;
+        this.submitTranslation(node);
+      }
+    }
+    for (const [node, visibility] of this.#queuedNodes) {
+      if (visibility === "hidden") {
+        hiddenCounts++;
+        this.submitTranslation(node);
+      }
+    }
+
+    ChromeUtils.addProfilerMarker(
+      "Translations",
+      { innerWindowId: this.innerWindowId },
+      `Translate ${this.#queuedNodes.size} nodes.\n\n` +
+        `In viewport: ${inViewportCounts}\n` +
+        `Out of viewport: ${outOfViewportCounts}\n` +
+        `Hidden: ${hiddenCounts}\n`
+    );
+
+    this.#queuedNodes.clear();
+
+    if (!this.viewportTranslated && inViewportTranslations) {
+      // Provide a promise that can be used to determine when the initial viewport has
+      // been translated. This is a key user-visible metric.
+      this.viewportTranslated = Promise.allSettled(inViewportTranslations);
+    }
+  }
+
+  /**
+   * Record how many words were in the viewport, as this is the most important
+   * user-visible translation content. The count is logged and added as a profiler
+   * marker. Nothing is reported once `viewportTranslated` has been created, or when
+   * no nodes are queued.
+   */
+  reportWordsInViewport() {
+    if (
+      // This promise gets created for the first dispatchQueuedTranslations
+      this.viewportTranslated ||
+      this.#queuedNodes.size === 0
+    ) {
+      return;
+    }
+
+    // TODO(Bug 1814195) - Add telemetry.
+    // TODO(Bug 1820618) - This whitespace regex will not work in CJK-like languages.
+    // This requires a segmenter for a proper implementation.
+
+    const whitespace = /\s+/;
+    let wordCount = 0;
+    for (const [node, visibility] of this.#queuedNodes) {
+      if (visibility === "in-viewport") {
+        wordCount += node.textContent.trim().split(whitespace).length;
+      }
+    }
+
+    const message = wordCount + " words are in the viewport.";
+    lazy.console.log(message);
+    ChromeUtils.addProfilerMarker(
+      "Translations",
+      { innerWindowId: this.innerWindowId },
+      message
+    );
+  }
+
+  /**
+   * Submit a node for translation to the translations engine. Elements are sent as
+   * HTML through `translateHTML`, any other node is sent as text through
+   * `translateText`. Failures are logged rather than thrown, so the returned promise
+   * does not reject because of a failed translation.
+   *
+   * @param {Node} node
+   * @returns {Promise<void>}
+   */
+  async submitTranslation(node) {
+    // Give each element an id that gets passed through the translation so it can be
+    // reunited later on.
+    if (node.nodeType === Node.ELEMENT_NODE) {
+      node.querySelectorAll("*").forEach((el, i) => {
+        el.dataset.mozTranslationsId = i;
+      });
+    }
+
+    let text, translate;
+    if (node.nodeType === Node.ELEMENT_NODE) {
+      text = node.innerHTML;
+      translate = this.translateHTML;
+    } else {
+      text = node.textContent;
+      translate = this.translateText;
+    }
+
+    if (text.trim().length === 0) {
+      return;
+    }
+
+    // Mark this node as not to be translated again unless the contents are changed
+    // (which the observer will pick up on)
+    this.#processedNodes.add(node);
+
+    this.#pendingTranslationsCount++;
+    let translatedHTML;
+    try {
+      // NOTE(review): the first item of the resolved value is used, while the
+      // TranslationFunction typedef documents a `Promise<string>`. Confirm which one
+      // reflects what the translation functions resolve to.
+      [translatedHTML] = await translate(text);
+    } catch (error) {
+      lazy.console.error("Translation failed", error);
+      return;
+    } finally {
+      // Decrement exactly once per submission, whether it succeeded or failed. The
+      // count must not be decremented a second time if applying the result throws,
+      // otherwise it would go negative and never compare equal to 0 again.
+      this.#pendingTranslationsCount--;
+    }
+
+    try {
+      this.scheduleNodeUpdateWithTranslation(node, translatedHTML);
+    } catch (error) {
+      lazy.console.error("Translation failed", error);
+    }
+  }
+
+  /**
+   * Start the mutation observer, for instance after applying the translations to the DOM.
+   * Every root node that is still alive is observed again.
+   */
+  startMutationObserver() {
+    if (Cu.isDeadWrapper(this.observer)) {
+      // This observer is no longer alive.
+      return;
+    }
+    for (const node of this.#rootNodes) {
+      if (Cu.isDeadWrapper(node)) {
+        // This node is no longer alive.
+        continue;
+      }
+      this.observer.observe(node, {
+        characterData: true,
+        childList: true,
+        subtree: true,
+      });
+    }
+  }
+
+  /**
+   * Stop the mutation observer, for instance to apply the translations to the DOM.
+   */
+  stopMutationObserver() {
+    // Was the window already destroyed?
+    if (!Cu.isDeadWrapper(this.observer)) {
+      this.observer.disconnect();
+    }
+  }
+
+  /**
+   * Applies all of the batched translations to their nodes. It runs either directly
+   * when no translations are pending anymore, or from the timeout that is scheduled
+   * with `DOM_UPDATE_INTERVAL_MS`.
+   *
+   * This function is called asynchronously, so nodes may already be dead. Before
+   * accessing a node make sure and run `Cu.isDeadWrapper` to check that it is alive.
+   */
+  updateNodesWithTranslations() {
+    // Stop the mutations so that the updates won't trigger observations.
+    this.stopMutationObserver();
+
+    try {
+      for (const { node, translatedHTML } of this.#nodesWithTranslatedHTML) {
+        if (Cu.isDeadWrapper(node)) {
+          // The node is no longer alive.
+          ChromeUtils.addProfilerMarker(
+            "Translations",
+            { innerWindowId: this.innerWindowId },
+            "Node is no long alive."
+          );
+          continue;
+        }
+        try {
+          switch (node.nodeType) {
+            case Node.TEXT_NODE: {
+              if (translatedHTML.trim().length !== 0) {
+                // Only update the node if there is new text.
+                node.textContent = translatedHTML;
+              }
+              break;
+            }
+            case Node.ELEMENT_NODE: {
+              // TODO (Bug 1820625) - This is slow compared to the original implementation
+              // in the addon which set the innerHTML directly. We can't set the innerHTML
+              // here, but perhaps there is another way to get back some of the performance.
+              const translationsDocument = this.domParser.parseFromString(
+                `<!DOCTYPE html><div>${translatedHTML}</div>`,
+                "text/html"
+              );
+              updateElement(translationsDocument, node);
+              break;
+            }
+          }
+        } catch (error) {
+          // A failure to update one node should not drop the rest of the batch.
+          lazy.console.error(
+            "Failed to update a node with its translation",
+            error
+          );
+        }
+      }
+    } finally {
+      // Always reset the batch and resume observing. Otherwise a single failure would
+      // leave the observer disconnected, and the same batch would be retried forever.
+      this.#nodesWithTranslatedHTML.clear();
+      this.#updateTimeout = null;
+
+      // Done mutating the DOM.
+      this.startMutationObserver();
+    }
+  }
+
+  /**
+   * Schedule a node to be updated with a translation. The update is applied right away
+   * when no other translations are pending, otherwise it is batched into an update
+   * that runs after `DOM_UPDATE_INTERVAL_MS`.
+   *
+   * @param {Node} node
+   * @param {string} translatedHTML
+   */
+  scheduleNodeUpdateWithTranslation(node, translatedHTML) {
+    // Add the nodes to be populated with the next translation update.
+    this.#nodesWithTranslatedHTML.add({ node, translatedHTML });
+
+    if (this.#pendingTranslationsCount === 0) {
+      // No translations are pending, update the node.
+      this.updateNodesWithTranslations();
+    } else if (!this.#updateTimeout) {
+      // Schedule an update.
+      this.#updateTimeout = lazy.setTimeout(
+        this.updateNodesWithTranslations.bind(this),
+        DOM_UPDATE_INTERVAL_MS
+      );
+    } else {
+      // An update has been previously scheduled, do nothing here.
+    }
+  }
+
+  /**
+   * Check to see if a node's language matches the document language.
+   *
+   * @param {Node} node
+   * @returns {boolean} True when the node has no `lang`, or when its `lang` matches
+   *   the document language. False when it differs or cannot be canonicalized.
+   */
+  matchesDocumentLanguage(node) {
+    if (!node.lang) {
+      // No `lang` was present, so assume it matches the language.
+      return true;
+    }
+
+    // First, cheaply check if language tags match, without canonicalizing.
+    if (langTagsMatch(this.documentLanguage, node.lang)) {
+      return true;
+    }
+
+    try {
+      // Make sure the locale is in the canonical form, and check again. This function
+      // throws, so don't trust that the language tags are formatted correctly.
+      const [language] = Intl.getCanonicalLocales(node.lang);
+
+      return langTagsMatch(this.documentLanguage, language);
+    } catch (_error) {
+      return false;
+    }
+  }
+}
+
+/**
+ * Determines if a node is hidden through its computed `display` or `visibility`
+ * style. For a Text node the style of its parent element is used.
+ *
+ * This function needs to be fairly fast since it's used on many nodes when iterating
+ * over the DOM to find nodes to translate.
+ *
+ * @param {Text | HTMLElement} node
+ * @returns {boolean} True when the node is hidden, or when it is a Text node that has
+ *   no parent element to compute a style from.
+ */
+function isNodeHidden(node) {
+  /** @type {HTMLElement | null} */
+  const element = node.nodeType === Node.TEXT_NODE ? node.parentElement : node;
+
+  if (!element) {
+    // A Text node without a parent element, for instance one that was removed from
+    // the page again before its mutation was handled. There is nothing to compute a
+    // style from, so treat it as hidden rather than throwing on a null element.
+    return true;
+  }
+
+  // This flushes the style, which is a performance cost.
+  const style = element.ownerGlobal.getComputedStyle(element);
+  return style.display === "none" || style.visibility === "hidden";
+}
+
+/**
+ * A cheap comparison of two language tags that does not canonicalize them. The tags
+ * match when they are identical, or when `otherLanguage` is the two letter
+ * `knownLanguage` followed by a hyphen and a subtag, e.g. "en" and "en-US".
+ *
+ * @param {string} knownLanguage - A two letter language tag.
+ * @param {string} otherLanguage - The language tag to compare against.
+ * @returns {boolean}
+ * @throws {Error} When the tags differ and `knownLanguage` is not of length 2.
+ */
+function langTagsMatch(knownLanguage, otherLanguage) {
+  const isExactMatch = knownLanguage === otherLanguage;
+  if (isExactMatch) {
+    return true;
+  }
+
+  if (knownLanguage.length !== 2) {
+    throw new Error("Expected the knownLanguage to be of length 2.");
+  }
+
+  // The other tag matches when it starts with the known language and a hyphen.
+  return otherLanguage.startsWith(`${knownLanguage}-`);
+}
+
+/**
+ * Determines if a node's bounding box lies completely inside of the viewport. For a
+ * Text node the bounding box of its parent element is used.
+ *
+ * This function runs when walking the DOM, which means it is a hot function. It runs
+ * fairly fast even though it is computing the bounding box. This is all done in a tight
+ * loop, and it is done on mutations. Care should be taken with reflows caused by
+ * getBoundingClientRect, as this is a common performance issue.
+ *
+ * The following are the counts of how often this is run on a news site:
+ *
+ * Given:
+ *  1573 DOM nodes
+ *  504 Text nodes
+ *  1069 Elements
+ *
+ * There were:
+ *  209 calls to this function.
+ *
+ * @param {Node} node
+ * @returns {boolean} True when the node is entirely within the viewport. False
+ *   otherwise, including for a Text node that has no parent element.
+ */
+function isNodeInViewport(node) {
+  const window = node.ownerGlobal;
+  const document = node.ownerDocument;
+
+  /** @type {HTMLElement | null} */
+  const element = node.nodeType === Node.TEXT_NODE ? node.parentElement : node;
+
+  if (!element) {
+    // A Text node without a parent element has no box to measure, for instance when
+    // it was removed from the page again before its mutation was handled.
+    return false;
+  }
+
+  const rect = element.getBoundingClientRect();
+  return (
+    rect.top >= 0 &&
+    rect.left >= 0 &&
+    rect.bottom <=
+      (window.innerHeight || document.documentElement.clientHeight) &&
+    rect.right <= (window.innerWidth || document.documentElement.clientWidth)
+  );
+}
+
+/**
+ * Actually perform the update of the element with the translated node. This step
+ * will detach all of the "live" nodes, and match them up in the correct order as provided
+ * by the translations engine.
+ *
+ * @param {Document} translationsDocument
+ * @param {Element} element
+ * @returns {void}
+ */
+function updateElement(translationsDocument, element) {
+  // This text should have the same layout as the target, but it's not completely
+  // guaranteed since the content page could change at any time, and the translation process is async.
+  //
+  // The document has the following structure:
+  //
+  // <html>
+  //   <head>
+  //   <body>{translated content}</body>
+  // </html>
+
+  const originalHTML = element.innerHTML;
+
+  /**
+   * The Set of translation IDs for nodes that have been cloned.
+   * @type {Set<number>}
+   */
+  const clonedNodes = new Set();
+
+  merge(element, translationsDocument.body.firstChild);
+
+  /**
+   * Merge the live tree with the translated tree by re-using elements from the live tree.
+   *
+   * @param {Node} liveTree
+   * @param {Node} translatedTree
+   */
+  function merge(liveTree, translatedTree) {
+    /** @type {Map<number, Element>} */
+    const liveElementsById = new Map();
+
+    /** @type {Array<Text>} */
+    const liveTextNodes = [];
+
+    // Remove all the nodes from the liveTree, and categorize them by Text node or
+    // Element node.
+    let node;
+    while ((node = liveTree.firstChild)) {
+      node.remove();
+
+      if (node.nodeType === Node.ELEMENT_NODE) {
+        liveElementsById.set(node.dataset.mozTranslationsId, node);
+      } else if (node.nodeType === Node.TEXT_NODE) {
+        liveTextNodes.push(node);
+      }
+    }
+
+    // The translated tree dictates the order.
+    const translatedNodes = translatedTree.childNodes;
+    for (
+      let translatedIndex = 0;
+      translatedIndex < translatedNodes.length;
+      translatedIndex++
+    ) {
+      const translatedNode = translatedNodes[translatedIndex];
+
+      if (translatedNode.nodeType === Node.TEXT_NODE) {
+        // Copy the translated text to the original Text node and re-append it.
+        let liveTextNode = liveTextNodes.shift();
+
+        if (liveTextNode) {
+          liveTextNode.data = translatedNode.data;
+        } else {
+          liveTextNode = translatedNode;
+        }
+
+        liveTree.appendChild(liveTextNode);
+      } else if (translatedNode.nodeType === Node.ELEMENT_NODE) {
+        const translationsId = translatedNode.dataset.mozTranslationsId;
+        // Element nodes try to use the already existing DOM nodes.
+
+        // Find the element in the live tree that matches the one in the translated tree.
+        let liveElement = liveElementsById.get(translationsId);
+
+        if (!liveElement) {
+          lazy.console.warn("Could not find a corresponding live element", {
+            path: createNodePath(translatedNode, translationsDocument.body),
+            translationsId,
+            liveElementsById,
+            translatedNode,
+          });
+          continue;
+        }
+
+        // Has this element already been added to the list? Then duplicate it and re-add
+        // it as a clone. The Translations Engine can sometimes duplicate HTML.
+        if (liveElement.parentNode) {
+          liveElement = liveElement.cloneNode(true /* deep clone */);
+          clonedNodes.add(translationsId);
+          lazy.console.warn(
+            "Cloning a node because it was already inserted earlier",
+            {
+              path: createNodePath(translatedNode, translationsDocument.body),
+              translatedNode,
+              liveElement,
+            }
+          );
+        }
+
+        if (isNodeTextEmpty(translatedNode)) {
+          // The original node had text, but the one that came out of translation
+          // didn't have any text. This scenario might be caused by one of two causes:
+          //
+          //   1) The element was duplicated by translation but then not given text
+          //      content. This happens on Wikipedia articles for example.
+          //
+          //   2) The translator messed up and could not translate the text. This
+          //      happens on YouTube in the language selector. In that case, having the
+          //      original text is much better than no text at all.
+          //
+          // To make sure it is case 1 and not case 2 check whether this is the only occurrence.
+          for (let i = 0; i < translatedNodes.length; i++) {
+            if (translatedIndex === i) {
+              // This is the current node, not a sibling.
+              continue;
+            }
+            const sibling = translatedNodes[i];
+            if (
+              // Only consider other element nodes.
+              sibling.nodeType === Node.ELEMENT_NODE &&
+              // If the sibling's translationsId matches, then use the sibling's
+              // node instead.
+              translationsId === sibling.dataset.mozTranslationsId
+            ) {
+              // This is case 1 from above. Remove this element's original text nodes,
+              // since a sibling text node now has all of the text nodes.
+              removeTextNodes(liveElement);
+            }
+          }
+
+          // Report this issue to the console.
+          lazy.console.warn(
+            "The translated element has no text even though the original did.",
+            {
+              path: createNodePath(translatedNode, translationsDocument.body),
+              translatedNode,
+              liveElement,
+            }
+          );
+        } else if (!isNodeTextEmpty(liveElement)) {
+          // There are still text nodes to find and update, recursively merge.
+          merge(liveElement, translatedNode);
+        }
+
+        // Put the live node back in the live branch, now that it has been synced with
+        // the translated text and order.
+        liveTree.appendChild(liveElement);
+      }
+    }
+
+    const unhandledElements = [...liveElementsById].filter(
+      ([, element]) => !element.parentNode
+    );
+
+    if (unhandledElements.length) {
+      lazy.console.warn(
+        `${createNodePath(
+          translatedTree,
+          translationsDocument.body
+        )} Not all nodes unified`,
+        {
+          unhandledElements,
+          clonedNodes,
+          originalHTML,
+          translatedHTML: translationsDocument.body.innerHTML,
+          liveTree: liveTree.outerHTML,
+          translatedTree: translatedTree.outerHTML,
+        }
+      );
+    }
+  }
+}
+
+/**
+ * For debug purposes, compute a string path to an element.
+ *
+ * e.g. "div/div#header/p.bold.string/a"
+ *
+ * @param {Node} node
+ * @param {Node | null} root
+ */
+function createNodePath(node, root) {
+  if (root === null) {
+    root = node.ownerDocument.body;
+  }
+  let path =
+    node.parentNode && node.parentNode !== root
+      ? createNodePath(node.parentNode)
+      : "";
+  path += `/${node.nodeName}`;
+  if (node.id) {
+    path += `#${node.id}`;
+  } else if (node.className) {
+    for (const className of node.classList) {
+      path += "." + className;
+    }
+  }
+  return path;
+}
+
+/**
+ * @param {Node} node
+ * @returns {boolean}
+ */
+function isNodeTextEmpty(node) {
+  if ("innerText" in node) {
+    return node.innerText.trim().length === 0;
+  }
+  if (node.nodeType === Node.TEXT_NODE && node.nodeValue) {
+    return node.nodeValue.trim().length === 0;
+  }
+  return true;
+}
+
+/**
+ * @param {Node} node
+ */
+function removeTextNodes(node) {
+  for (const child of node.childNodes) {
+    switch (child.nodeType) {
+      case Node.TEXT_NODE:
+        node.removeChild(child);
+        break;
+      case Node.ELEMENT_NODE:
+        removeTextNodes(child);
+        break;
+      default:
+        break;
+    }
+  }
+}
+
+/**
+ * Test whether any of the direct child text nodes of are non-whitespace
+ * text nodes.
+ *
+ * For example:
+ *   - `<p>test</p>`: yes
+ *   - `<p> </p>`: no
+ *   - `<p><b>test</b></p>`: no
+ * @param {Node} node
+ * @returns {boolean}
+ */
+function hasTextNodes(node) {
+  if (node.nodeType !== Node.ELEMENT_NODE) {
+    // Only check element nodes.
+    return false;
+  }
+
+  for (const child of node.childNodes) {
+    if (child.nodeType === Node.TEXT_NODE) {
+      if (child.textContent.trim() === "") {
+        // This is just whitespace.
+        continue;
+      }
+      // A text node with content was found.
+      return true;
+    }
+  }
+
+  // No text nodes were found.
+  return false;
+}
+
+/**
+ * Like `isExcludedNode` but looks at the full subtree. Used to see whether
+ * we can submit a subtree, or whether we should split it into smaller
+ * branches first to try to exclude more of the non-translatable content.
+ *
+ * @param {Node} node
+ * @param {string} excludedNodeSelector
+ * @returns {boolean}
+ */
+function containsExcludedNode(node, excludedNodeSelector) {
+  return (
+    node.nodeType === Node.ELEMENT_NODE &&
+    node.querySelector(excludedNodeSelector)
+  );
+}
+
+/**
+ * Check if this node has already been queued to be translated. This can be because
+ * the node is itself is queued, or its parent node is queued.
+ *
+ * @param {Node} node
+ * @param {Map<Node, any>} queuedNodes
+ * @returns {boolean}
+ */
+function isNodeQueued(node, queuedNodes) {
+  if (queuedNodes.has(node)) {
+    return true;
+  }
+
+  // If the immediate parent is the body, it is allowed.
+  if (node.parentNode === node.ownerDocument.body) {
+    return false;
+  }
+
+  // Accessing the parentNode is expensive here according to performance profilling. This
+  // is due to XrayWrappers. Minimize reading attributes by storing a reference to the
+  // `parentNode` in a named variable, rather than re-accessing it.
+  let parentNode;
+  let lastNode = node;
+  while ((parentNode = lastNode.parentNode)) {
+    if (queuedNodes.has(parentNode)) {
+      return parentNode;
+    }
+    lastNode = parentNode;
+  }
+
+  return false;
+}
+
+/**
+ * Test whether this node should be treated as a wrapper of text, e.g.
+ * a `<p>`, or as a wrapper for block elements, e.g. `<div>`, based on
+ * its ratio of assumed inline elements, and assumed "block" elements. If it is a wrapper
+ * of block elements, then it needs more subdividing. This algorithm is based on
+ * heuristics and is a best effort attempt at sorting contents without actually computing
+ * the style of every element.
+ *
+ * If it's a Text node, it's inline and doesn't need subdividing.
+ *
+ *  "Lorem ipsum"
+ *
+ * If it is mostly filled with assumed "inline" elements, treat it as inline.
+ *   <p>
+ *     Lorem ipsum dolor sit amet, consectetur adipiscing elit.
+ *     <b>Nullam ut finibus nibh</b>, at tincidunt tellus.
+ *   </p>
+ *
+ *   Since it has 3 "inline" elements.
+ *     1. "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
+ *     2. <b>Nullam ut finibus nibh</b>
+ *     3. ", at tincidunt tellus."
+ *
+ * If it's mostly filled with block elements, do not treat it as inline, as it will
+ * need more subdividing.
+ *
+ *   <section>
+ *     Lorem ipsum <strong>dolor sit amet.</strong>
+ *     <div>Nullam ut finibus nibh, at tincidunt tellus.</div>
+ *     <div>Morbi pharetra mauris sed nisl mollis molestie.</div>
+ *     <div>Donec et nibh sit amet velit tincidunt auctor.</div>
+ *   </section>
+ *
+ *   This node has 2 presumed "inline" elements:
+ *       1 "Lorem ipsum"
+ *       2. <strong>dolor sit amet.</strong>.
+ *
+ *   And the 3 div "block" elements. Since 3 "block" elements > 2 "inline" elements,
+ *   it is presumed to be "inline".
+ *
+ * @param {Node} node
+ * @returns {boolean}
+ */
+function nodeNeedsSubdividing(node) {
+  if (node.nodeType === Node.TEXT_NODE) {
+    // Text nodes are fully subdivided.
+    return false;
+  }
+
+  let inlineElements = 0;
+  let blockElements = 0;
+
+  if (node.nodeName === "TR") {
+    // TR elements always need subdividing, since the cells are the individual "inline"
+    // units. For instance the following would be invalid markup:
+    //
+    //   <tr>
+    //     This is <b>invalid</b>
+    //   </tr>
+    //
+    // You will always have the following, which will need more subdividing.
+    //
+    //   <tr>
+    //     <td>This is <b>valid</b>.</td>
+    //     <td>This is still valid.</td>
+    //   </tr>
+    return true;
+  }
+
+  for (let child of node.childNodes) {
+    switch (child.nodeType) {
+      case Node.TEXT_NODE:
+        if (!isNodeTextEmpty(child)) {
+          inlineElements += 1;
+        }
+        break;
+      case Node.ELEMENT_NODE: {
+        // Property access can be expensive, so destructure the required properties.
+        const { nodeName } = child;
+        if (INLINE_TAGS.has(nodeName)) {
+          inlineElements += 1;
+        } else if (GENERIC_TAGS.has(nodeName) && !nodeNeedsSubdividing(child)) {
+          inlineElements += 1;
+        } else {
+          blockElements += 1;
+        }
+        break;
+      }
+      default:
+        break;
+    }
+  }
+
+  return inlineElements < blockElements;
+}
+
+/**
+ * Returns an iterator of a node's ancestors.
+ *
+ * @param {Node} node
+ * @returns {Generator<ParentNode>}
+ */
+function* getAncestorsIterator(node) {
+  const document = node.ownerDocument;
+  for (
+    let parent = node.parentNode;
+    parent && parent !== document.documentElement;
+    parent = parent.parentNode
+  ) {
+    yield parent;
+  }
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 19:33:14 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 19:33:14 +0000
commit	36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree	105e8c98ddea1c1e4784a60a5a6410fa416be2de /toolkit/components/translations/content/translations-document.sys.mjs
parent	Initial commit. (diff)
download	firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip