summaryrefslogtreecommitdiffstats
path: root/browser/components/translation/TranslationDocument.jsm
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 09:22:09 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 09:22:09 +0000
commit43a97878ce14b72f0981164f87f2e35e14151312 (patch)
tree620249daf56c0258faa40cbdcf9cfba06de2a846 /browser/components/translation/TranslationDocument.jsm
parentInitial commit. (diff)
downloadfirefox-upstream.tar.xz
firefox-upstream.zip
Adding upstream version 110.0.1.upstream/110.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'browser/components/translation/TranslationDocument.jsm')
-rw-r--r--browser/components/translation/TranslationDocument.jsm677
1 files changed, 677 insertions, 0 deletions
diff --git a/browser/components/translation/TranslationDocument.jsm b/browser/components/translation/TranslationDocument.jsm
new file mode 100644
index 0000000000..46b3339832
--- /dev/null
+++ b/browser/components/translation/TranslationDocument.jsm
@@ -0,0 +1,677 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+"use strict";
+
+var EXPORTED_SYMBOLS = ["TranslationDocument"];
+
+const { Async } = ChromeUtils.import("resource://services-common/async.js");
+
+/**
+ * This class represents a document that is being translated,
+ * and it is responsible for parsing the document,
+ * generating the data structures translation (the list of
+ * translation items and roots), and managing the original
+ * and translated texts on the translation items.
+ *
+ * @param document The document to be translated
+ */
+var TranslationDocument = function(document) {
+ this.itemsMap = new Map();
+ this.roots = [];
+ this._init(document);
+};
+
+TranslationDocument.prototype = {
+ translatedFrom: "",
+ translatedTo: "",
+ translationError: false,
+ originalShown: true,
+
+ /**
+ * Initializes the object and populates
+ * the roots lists.
+ *
+ * @param document The document to be translated
+ */
+ _init(document) {
+ let winUtils = document.defaultView.windowUtils;
+
+ // Get all the translation nodes in the document's body:
+ // a translation node is a node from the document which
+ // contains useful content for translation, and therefore
+ // must be included in the translation process.
+ let nodeList = winUtils.getTranslationNodes(document.body);
+
+ let length = nodeList.length;
+
+ for (let i = 0; i < length; i++) {
+ let node = nodeList.item(i);
+ let isRoot = nodeList.isTranslationRootAtIndex(i);
+
+ // Create a TranslationItem object for this node.
+ // This function will also add it to the this.roots array.
+ this._createItemForNode(node, i, isRoot);
+ }
+
+ // At first all roots are stored in the roots list, and only after
+ // the process has finished we're able to determine which roots are
+ // simple, and which ones are not.
+
+ // A simple root is defined by a root with no children items, which
+ // basically represents an element from a page with only text content
+ // inside.
+
+ // This distinction is useful for optimization purposes: we treat a
+ // simple root as plain-text in the translation process and with that
+ // we are able to reduce their data payload sent to the translation service.
+
+ for (let root of this.roots) {
+ if (!root.children.length && root.nodeRef.childElementCount == 0) {
+ root.isSimpleRoot = true;
+ }
+ }
+ },
+
+ /**
+ * Creates a TranslationItem object, which should be called
+ * for each node returned by getTranslationNodes.
+ *
+ * @param node The DOM node for this item.
+ * @param id A unique, numeric id for this item.
+ * @parem isRoot A boolean saying whether this item is a root.
+ *
+ * @returns A TranslationItem object.
+ */
+ _createItemForNode(node, id, isRoot) {
+ if (this.itemsMap.has(node)) {
+ return this.itemsMap.get(node);
+ }
+
+ let item = new TranslationItem(node, id, isRoot);
+
+ if (isRoot) {
+ // Root items do not have a parent item.
+ this.roots.push(item);
+ } else {
+ let parentItem = this.itemsMap.get(node.parentNode);
+ if (parentItem) {
+ parentItem.children.push(item);
+ }
+ }
+
+ this.itemsMap.set(node, item);
+ return item;
+ },
+
+ /**
+ * Generate the text string that represents a TranslationItem object.
+ * Besides generating the string, it's also stored in the "original"
+ * field of the TranslationItem object, which needs to be stored for
+ * later to be used in the "Show Original" functionality.
+ * If this function had already been called for the given item (determined
+ * by the presence of the "original" array in the item), the text will
+ * be regenerated from the "original" data instead of from the related
+ * DOM nodes (because the nodes might contain translated data).
+ *
+ * @param item A TranslationItem object
+ *
+ * @returns A string representation of the TranslationItem.
+ */
+ generateTextForItem(item) {
+ if (item.original) {
+ return regenerateTextFromOriginalHelper(item);
+ }
+
+ if (item.isSimpleRoot) {
+ let text = item.nodeRef.firstChild.nodeValue.trim();
+ item.original = [text];
+ return text;
+ }
+
+ let str = "";
+ item.original = [];
+ let wasLastItemPlaceholder = false;
+
+ for (let child of item.nodeRef.childNodes) {
+ if (child.nodeType == child.TEXT_NODE) {
+ let x = child.nodeValue.trim();
+ if (x != "") {
+ item.original.push(x);
+ str += x;
+ wasLastItemPlaceholder = false;
+ }
+ continue;
+ }
+
+ let objInMap = this.itemsMap.get(child);
+ if (objInMap && !objInMap.isRoot) {
+ // If this childNode is present in the itemsMap, it means
+ // it's a translation node: it has useful content for translation.
+ // In this case, we need to stringify this node.
+ // However, if this item is a root, we should skip it here in this
+ // object's child list (and just add a placeholder for it), because
+ // it will be stringfied separately for being a root.
+ item.original.push(objInMap);
+ str += this.generateTextForItem(objInMap);
+ wasLastItemPlaceholder = false;
+ } else if (!wasLastItemPlaceholder) {
+ // Otherwise, if this node doesn't contain any useful content,
+ // or if it is a root itself, we can replace it with a placeholder node.
+ // We can't simply eliminate this node from our string representation
+ // because that could change the HTML structure (e.g., it would
+ // probably merge two separate text nodes).
+ // It's not necessary to add more than one placeholder in sequence;
+ // we can optimize them away.
+ item.original.push(TranslationItem_NodePlaceholder);
+ str += "<br>";
+ wasLastItemPlaceholder = true;
+ }
+ }
+
+ return generateTranslationHtmlForItem(item, str);
+ },
+
+ /**
+ * Changes the document to display its translated
+ * content.
+ */
+ showTranslation() {
+ this.originalShown = false;
+ this._swapDocumentContent("translation");
+ },
+
+ /**
+ * Changes the document to display its original
+ * content.
+ */
+ showOriginal() {
+ this.originalShown = true;
+ this._swapDocumentContent("original");
+ },
+
+ /**
+ * Swap the document with the resulting translation,
+ * or back with the original content.
+ *
+ * @param target A string that is either "translation"
+ * or "original".
+ */
+ _swapDocumentContent(target) {
+ (async () => {
+ // Let the event loop breath on every 100 nodes
+ // that are replaced.
+ const YIELD_INTERVAL = 100;
+ await Async.yieldingForEach(
+ this.roots,
+ root => root.swapText(target),
+ YIELD_INTERVAL
+ );
+ })();
+ },
+};
+
+/**
+ * This class represents an item for translation. It's basically our
+ * wrapper class around a node returned by getTranslationNode, with
+ * more data and structural information on it.
+ *
+ * At the end of the translation process, besides the properties below,
+ * a TranslationItem will contain two other properties: one called "original"
+ * and one called "translation". They are twin objects, one which reflect
+ * the structure of that node in its original state, and the other in its
+ * translated state.
+ *
+ * The "original" array is generated in the generateTextForItem function,
+ * and the "translation" array is generated when the translation results
+ * are parsed.
+ *
+ * They are both arrays, which contain a mix of strings and references to
+ * child TranslationItems. The references in both arrays point to the * same *
+ * TranslationItem object, but they might appear in different orders between the
+ * "original" and "translation" arrays.
+ *
+ * An example:
+ *
+ * English: <div id="n1">Welcome to <b id="n2">Mozilla's</b> website</div>
+ * Portuguese: <div id="n1">Bem vindo a pagina <b id="n2">da Mozilla</b></div>
+ *
+ * TranslationItem n1 = {
+ * id: 1,
+ * original: ["Welcome to", ptr to n2, "website"]
+ * translation: ["Bem vindo a pagina", ptr to n2]
+ * }
+ *
+ * TranslationItem n2 = {
+ * id: 2,
+ * original: ["Mozilla's"],
+ * translation: ["da Mozilla"]
+ * }
+ */
+function TranslationItem(node, id, isRoot) {
+ this.nodeRef = node;
+ this.id = id;
+ this.isRoot = isRoot;
+ this.children = [];
+}
+
+TranslationItem.prototype = {
+ isRoot: false,
+ isSimpleRoot: false,
+
+ toString() {
+ let rootType = "";
+ if (this.isRoot) {
+ if (this.isSimpleRoot) {
+ rootType = " (simple root)";
+ } else {
+ rootType = " (non simple root)";
+ }
+ }
+ return (
+ "[object TranslationItem: <" +
+ this.nodeRef.localName +
+ ">" +
+ rootType +
+ "]"
+ );
+ },
+
+ /**
+ * This function will parse the result of the translation of one translation
+ * item. If this item was a simple root, all we sent was a plain-text version
+ * of it, so the result is also straightforward text.
+ *
+ * For non-simple roots, we sent a simplified HTML representation of that
+ * node, and we'll first parse that into an HTML doc and then call the
+ * parseResultNode helper function to parse it.
+ *
+ * While parsing, the result is stored in the "translation" field of the
+ * TranslationItem, which will be used to display the final translation when
+ * all items are finished. It remains stored too to allow back-and-forth
+ * switching between the "Show Original" and "Show Translation" functions.
+ *
+ * @param result A string with the textual result received from the server,
+ * which can be plain-text or a serialized HTML doc.
+ */
+ parseResult(result) {
+ if (this.isSimpleRoot) {
+ this.translation = [result];
+ return;
+ }
+
+ let domParser = new DOMParser();
+
+ let doc = domParser.parseFromString(result, "text/html");
+ parseResultNode(this, doc.body.firstChild);
+ },
+
+ /**
+ * This function finds a child TranslationItem
+ * with the given id.
+ * @param id The id to look for, in the format "n#"
+ * @returns A TranslationItem with the given id, or null if
+ * it was not found.
+ */
+ getChildById(id) {
+ for (let child of this.children) {
+ if ("n" + child.id == id) {
+ return child;
+ }
+ }
+ return null;
+ },
+
+ /**
+ * Swap the text of this TranslationItem between
+ * its original and translated states.
+ *
+ * @param target A string that is either "translation"
+ * or "original".
+ */
+ swapText(target) {
+ swapTextForItem(this, target);
+ },
+};
+
+/**
+ * This object represents a placeholder item for translation. It's similar to
+ * the TranslationItem class, but it represents nodes that have no meaningful
+ * content for translation. These nodes will be replaced by "<br>" in a
+ * translation request. It's necessary to keep them to use it as a mark
+ * for correct positioning and spliting of text nodes.
+ */
+const TranslationItem_NodePlaceholder = {
+ toString() {
+ return "[object TranslationItem_NodePlaceholder]";
+ },
+};
+
+/**
+ * Generate the outer HTML representation for a given item.
+ *
+ * @param item A TranslationItem object.
+ * param content The inner content for this item.
+ * @returns string The outer HTML needed for translation
+ * of this item.
+ */
+function generateTranslationHtmlForItem(item, content) {
+ let localName = item.isRoot ? "div" : "b";
+ return (
+ "<" + localName + " id=n" + item.id + ">" + content + "</" + localName + ">"
+ );
+}
+
+/**
+ * Regenerate the text string that represents a TranslationItem object,
+ * with data from its "original" array. The array must have already
+ * been created by TranslationDocument.generateTextForItem().
+ *
+ * @param item A TranslationItem object
+ *
+ * @returns A string representation of the TranslationItem.
+ */
+function regenerateTextFromOriginalHelper(item) {
+ if (item.isSimpleRoot) {
+ return item.original[0];
+ }
+
+ let str = "";
+ for (let child of item.original) {
+ if (child instanceof TranslationItem) {
+ str += regenerateTextFromOriginalHelper(child);
+ } else if (child === TranslationItem_NodePlaceholder) {
+ str += "<br>";
+ } else {
+ str += child;
+ }
+ }
+
+ return generateTranslationHtmlForItem(item, str);
+}
+
+/**
+ * Helper function to parse a HTML doc result.
+ * How it works:
+ *
+ * An example result string is:
+ *
+ * <div id="n1">Hello <b id="n2">World</b> of Mozilla.</div>
+ *
+ * For an element node, we look at its id and find the corresponding
+ * TranslationItem that was associated with this node, and then we
+ * walk down it repeating the process.
+ *
+ * For text nodes we simply add it as a string.
+ */
+function parseResultNode(item, node) {
+ item.translation = [];
+ for (let child of node.childNodes) {
+ if (child.nodeType == child.TEXT_NODE) {
+ item.translation.push(child.nodeValue);
+ } else if (child.localName == "br") {
+ item.translation.push(TranslationItem_NodePlaceholder);
+ } else {
+ let translationItemChild = item.getChildById(child.id);
+
+ if (translationItemChild) {
+ item.translation.push(translationItemChild);
+ parseResultNode(translationItemChild, child);
+ }
+ }
+ }
+}
+
+/**
+ * Helper function to swap the text of a TranslationItem
+ * between its original and translated states.
+ * How it works:
+ *
+ * The function iterates through the target array (either the `original` or
+ * `translation` array from the TranslationItem), while also keeping a pointer
+ * to a current position in the child nodes from the actual DOM node that we
+ * are modifying. This pointer is moved forward after each item of the array
+ * is translated. If, at any given time, the pointer doesn't match the expected
+ * node that was supposed to be seen, it means that the original and translated
+ * contents have a different ordering, and thus we need to adjust that.
+ *
+ * A full example of the reordering process, swapping from Original to
+ * Translation:
+ *
+ * Original (en): <div>I <em>miss</em> <b>you</b></div>
+ *
+ * Translation (fr): <div><b>Tu</b> me <em>manques</em></div>
+ *
+ * Step 1:
+ * pointer points to firstChild of the DOM node, textnode "I "
+ * first item in item.translation is [object TranslationItem <b>]
+ *
+ * pointer does not match the expected element, <b>. So let's move <b> to the
+ * pointer position.
+ *
+ * Current state of the DOM:
+ * <div><b>you</b>I <em>miss</em> </div>
+ *
+ * Step 2:
+ * pointer moves forward to nextSibling, textnode "I " again.
+ * second item in item.translation is the string " me "
+ *
+ * pointer points to a text node, and we were expecting a text node. Match!
+ * just replace the text content.
+ *
+ * Current state of the DOM:
+ * <div><b>you</b> me <em>miss</em> </div>
+ *
+ * Step 3:
+ * pointer moves forward to nextSibling, <em>miss</em>
+ * third item in item.translation is [object TranslationItem <em>]
+ *
+ * pointer points to the expected node. Match! Nothing to do.
+ *
+ * Step 4:
+ * all items in this item.translation were transformed. The remaining
+ * text nodes are cleared to "", and domNode.normalize() removes them.
+ *
+ * Current state of the DOM:
+ * <div><b>you</b> me <em>miss</em></div>
+ *
+ * Further steps:
+ * After that, the function will visit the child items (from the visitStack),
+ * and the text inside the <b> and <em> nodes will be swapped as well,
+ * yielding the final result:
+ *
+ * <div><b>Tu</b> me <em>manques</em></div>
+ *
+ *
+ * @param item A TranslationItem object
+ * @param target A string that is either "translation"
+ * or "original".
+ */
+function swapTextForItem(item, target) {
+ // visitStack is the stack of items that we still need to visit.
+ // Let's start the process by adding the root item.
+ let visitStack = [item];
+
+ while (visitStack.length) {
+ let curItem = visitStack.shift();
+
+ let domNode = curItem.nodeRef;
+ if (!domNode) {
+ // Skipping this item due to a missing node.
+ continue;
+ }
+
+ if (!curItem[target]) {
+ // Translation not found for this item. This could be due to
+ // an error in the server response. For example, if a translation
+ // was broken in various chunks, and one of the chunks failed,
+ // the items from that chunk will be missing its "translation"
+ // field.
+ continue;
+ }
+
+ domNode.normalize();
+
+ // curNode points to the child nodes of the DOM node that we are
+ // modifying. During most of the process, while the target array is
+ // being iterated (in the for loop below), it should walk together with
+ // the array and be pointing to the correct node that needs to modified.
+ // If it's not pointing to it, that means some sort of node reordering
+ // will be necessary to produce the correct translation.
+ // Note that text nodes don't need to be reordered, as we can just replace
+ // the content of one text node with another.
+ //
+ // curNode starts in the firstChild...
+ let curNode = domNode.firstChild;
+
+ // ... actually, let's make curNode start at the first useful node (either
+ // a non-blank text node or something else). This is not strictly necessary,
+ // as the reordering algorithm would correctly handle this case. However,
+ // this better aligns the resulting translation with the DOM content of the
+ // page, avoiding cases that would need to be unecessarily reordered.
+ //
+ // An example of how this helps:
+ //
+ // ---- Original: <div> <b>Hello </b> world.</div>
+ // ^textnode 1 ^item 1 ^textnode 2
+ //
+ // - Translation: <div><b>Hallo </b> Welt.</div>
+ //
+ // Transformation process without this optimization:
+ // 1 - start pointer at textnode 1
+ // 2 - move item 1 to first position inside the <div>
+ //
+ // Node now looks like: <div><b>Hello </b>[ ][ world.]</div>
+ // textnode 1^ ^textnode 2
+ //
+ // 3 - replace textnode 1 with " Welt."
+ // 4 - clear remaining text nodes (in this case, textnode 2)
+ //
+ // Transformation process with this optimization:
+ // 1 - start pointer at item 1
+ // 2 - item 1 is already in position
+ // 3 - replace textnode 2 with " Welt."
+ //
+ // which completely avoids any node reordering, and requires only one
+ // text change instead of two (while also leaving the page closer to
+ // its original state).
+ while (
+ curNode &&
+ curNode.nodeType == curNode.TEXT_NODE &&
+ curNode.nodeValue.trim() == ""
+ ) {
+ curNode = curNode.nextSibling;
+ }
+
+ // Now let's walk through all items in the `target` array of the
+ // TranslationItem. This means either the TranslationItem.original or
+ // TranslationItem.translation array.
+ for (let targetItem of curItem[target]) {
+ if (targetItem instanceof TranslationItem) {
+ // If the array element is another TranslationItem object, let's
+ // add it to the stack to be visited.
+ visitStack.push(targetItem);
+
+ let targetNode = targetItem.nodeRef;
+
+ // If the node is not in the expected position, let's reorder
+ // it into position...
+ if (
+ curNode != targetNode &&
+ // ...unless the page has reparented this node under a totally
+ // different node (or removed it). In this case, all bets are off
+ // on being able to do anything correctly, so it's better not to
+ // bring back the node to this parent.
+ targetNode.parentNode == domNode
+ ) {
+ // We don't need to null-check curNode because insertBefore(..., null)
+ // does what we need in that case: reorder this node to the end
+ // of child nodes.
+ domNode.insertBefore(targetNode, curNode);
+ curNode = targetNode;
+ }
+
+ // Move pointer forward. Since we do not add empty text nodes to the
+ // list of translation items, we must skip them here too while
+ // traversing the DOM in order to get better alignment between the
+ // text nodes and the translation items.
+ if (curNode) {
+ curNode = getNextSiblingSkippingEmptyTextNodes(curNode);
+ }
+ } else if (targetItem === TranslationItem_NodePlaceholder) {
+ // If the current item is a placeholder node, we need to move
+ // our pointer "past" it, jumping from one side of a block of
+ // elements + empty text nodes to the other side. Even if
+ // non-placeholder elements exists inside the jumped block,
+ // they will be pulled correctly later in the process when the
+ // targetItem for those nodes are handled.
+
+ while (
+ curNode &&
+ (curNode.nodeType != curNode.TEXT_NODE ||
+ curNode.nodeValue.trim() == "")
+ ) {
+ curNode = curNode.nextSibling;
+ }
+ } else {
+ // Finally, if it's a text item, we just need to find the next
+ // text node to use. Text nodes don't need to be reordered, so
+ // the first one found can be used.
+ while (curNode && curNode.nodeType != curNode.TEXT_NODE) {
+ curNode = curNode.nextSibling;
+ }
+
+ // If none was found and we reached the end of the child nodes,
+ // let's create a new one.
+ if (!curNode) {
+ // We don't know if the original content had a space or not,
+ // so the best bet is to create the text node with " " which
+ // will add one space at the beginning and one at the end.
+ curNode = domNode.appendChild(
+ domNode.ownerDocument.createTextNode(" ")
+ );
+ }
+
+ // A trailing and a leading space must be preserved because
+ // they are meaningful in HTML.
+ let preSpace = /^\s/.test(curNode.nodeValue) ? " " : "";
+ let endSpace = /\s$/.test(curNode.nodeValue) ? " " : "";
+
+ curNode.nodeValue = preSpace + targetItem + endSpace;
+ curNode = getNextSiblingSkippingEmptyTextNodes(curNode);
+ }
+ }
+
+ // The translated version of a node might have less text nodes than its
+ // original version. If that's the case, let's clear the remaining nodes.
+ if (curNode) {
+ clearRemainingNonEmptyTextNodesFromElement(curNode);
+ }
+
+ // And remove any garbage "" nodes left after clearing.
+ domNode.normalize();
+ }
+}
+
+function getNextSiblingSkippingEmptyTextNodes(startSibling) {
+ let item = startSibling.nextSibling;
+ while (
+ item &&
+ item.nodeType == item.TEXT_NODE &&
+ item.nodeValue.trim() == ""
+ ) {
+ item = item.nextSibling;
+ }
+ return item;
+}
+
+function clearRemainingNonEmptyTextNodesFromElement(startSibling) {
+ let item = startSibling;
+ while (item) {
+ if (item.nodeType == item.TEXT_NODE && item.nodeValue != "") {
+ item.nodeValue = "";
+ }
+ item = item.nextSibling;
+ }
+}