1 files changed, 561 insertions, 0 deletions
diff --git a/toolkit/components/reader/ReaderMode.sys.mjs b/toolkit/components/reader/ReaderMode.sys.mjs
new file mode 100644
index 0000000000..92fbcce367
--- /dev/null
+++ b/toolkit/components/reader/ReaderMode.sys.mjs
@@ -0,0 +1,561 @@
+// -*- indent-tabs-mode: nil; js-indent-level: 2 -*-
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// Bucket values for the telemetry histograms recorded by ReaderMode.
+const DOWNLOAD_SUCCESS = 0; // READER_MODE_DOWNLOAD_RESULT: document fetched
+const DOWNLOAD_ERROR_XHR = 1; // the XHR finished with a non-200 status
+const DOWNLOAD_ERROR_NO_DOC = 2; // the XHR response held no document
+
+const PARSE_SUCCESS = 0; // READER_MODE_PARSE_RESULT: article produced
+const PARSE_ERROR_TOO_MANY_ELEMENTS = 1; // element count above the limit
+const PARSE_ERROR_WORKER = 2; // the ReaderWorker call failed
+const PARSE_ERROR_NO_ARTICLE = 3; // no article came back from the worker
+
+// Class names kept on elements of the readerized output (handed to the worker
+// as the `classesToPreserve` option) so rules in aboutReader.css can match.
+const CLASSES_TO_PRESERVE = [
+  "caption",
+  "emoji",
+  "hidden",
+  "invisible",
+  "sr-only",
+  "visually-hidden",
+  "visuallyhidden",
+  "wp-caption",
+  "wp-caption-text",
+  "wp-smiley",
+];
+
+import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
+
+const lazy = {}; // Holds the lazily imported modules registered just below.
+
+ChromeUtils.defineESModuleGetters(lazy, {
+  LanguageDetector:
+    "resource://gre/modules/translation/LanguageDetector.sys.mjs",
+  ReaderWorker: "resource://gre/modules/reader/ReaderWorker.sys.mjs",
+  Readerable: "resource://gre/modules/Readerable.sys.mjs",
+});
+
+const gIsFirefoxDesktop = // Gates language and text direction detection.
+  Services.appinfo.ID == "{ec8030f7-c20a-464f-9b0e-13a3a9e97384}";
+
+Services.telemetry.setEventRecordingEnabled("readermode", true);
+
+export var ReaderMode = {
+  DEBUG: 0, // When truthy, log() writes its messages through dump().
+
+  // Date.now() timestamps set on entering/leaving; used for telemetry.
+  enterTime: undefined,
+  leaveTime: undefined,
+
+  /**
+   * Enter the reader mode by going forward one step in history if applicable,
+   * if not, append the about:reader page in the history instead.
+   */
+  enterReaderMode(docShell, win) {
+    this.enterTime = Date.now();
+
+    Services.telemetry.recordEvent("readermode", "view", "on", null, {
+      subcategory: "feature",
+    });
+
+    let url = win.document.location.href;
+    let readerURL = "about:reader?url=" + encodeURIComponent(url);
+
+    if (!Services.appinfo.sessionHistoryInParent) {
+      let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);
+      let sh = webNav.sessionHistory;
+      if (webNav.canGoForward) {
+        let forwardEntry = sh.legacySHistory.getEntryAtIndex(sh.index + 1);
+        let forwardURL = forwardEntry.URI.spec;
+        if (forwardURL && (forwardURL == readerURL || !readerURL)) {
+          webNav.goForward();
+          return;
+        }
+      }
+    }
+
+    // This could possibly move to the parent. See bug 1664982.
+    win.document.location = readerURL;
+  },
+
+  /**
+   * Exit the reader mode by going back one step in history if applicable,
+   * if not, append the original page in the history instead.
+   */
+  leaveReaderMode(docShell, win) {
+    this.leaveTime = Date.now();
+
+    // Measured in seconds (whole number)
+    let timeSpentInReaderMode = Math.floor(
+      (this.leaveTime - this.enterTime) / 1000
+    );
+
+    // Measured as percentage (whole number)
+    let scrollPosition = Math.floor(
+      ((win.scrollY + win.innerHeight) / win.document.body.clientHeight) * 100
+    );
+
+    Services.telemetry.recordEvent("readermode", "view", "off", null, {
+      subcategory: "feature",
+      reader_time: `${timeSpentInReaderMode}`,
+      scroll_position: `${scrollPosition}`,
+    });
+
+    let url = win.document.location.href;
+    let originalURL = this.getOriginalUrl(url);
+    let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);
+
+    if (!Services.appinfo.sessionHistoryInParent) {
+      let sh = webNav.sessionHistory;
+      if (webNav.canGoBack) {
+        let prevEntry = sh.legacySHistory.getEntryAtIndex(sh.index - 1);
+        let prevURL = prevEntry.URI.spec;
+        if (prevURL && (prevURL == originalURL || !originalURL)) {
+          webNav.goBack();
+          return;
+        }
+      }
+    }
+
+    let referrerURI, principal;
+    try {
+      referrerURI = Services.io.newURI(url);
+      principal = Services.scriptSecurityManager.createContentPrincipal(
+        referrerURI,
+        win.document.nodePrincipal.originAttributes
+      );
+    } catch (e) {
+      console.error(e);
+      return;
+    }
+    let loadFlags = webNav.LOAD_FLAGS_DISALLOW_INHERIT_PRINCIPAL;
+    let ReferrerInfo = Components.Constructor(
+      "@mozilla.org/referrer-info;1",
+      "nsIReferrerInfo",
+      "init"
+    );
+    let loadURIOptions = {
+      triggeringPrincipal: principal,
+      loadFlags,
+      referrerInfo: new ReferrerInfo(
+        Ci.nsIReferrerInfo.EMPTY,
+        true,
+        referrerURI
+      ),
+    };
+    // This could possibly move to the parent. See bug 1664982.
+    webNav.fixupAndLoadURIString(originalURL, loadURIOptions);
+  },
+
+  /**
+   * Returns original URL from an about:reader URL.
+   *
+   * @param url An about:reader URL.
+   * @return The original URL for the article, or null if we did not find
+   *         a properly formatted about:reader URL.
+   */
+  getOriginalUrl(url) {
+    if (!url.startsWith("about:reader?")) {
+      return null;
+    }
+
+    let outerHash = "";
+    try {
+      let uriObj = Services.io.newURI(url);
+      url = uriObj.specIgnoringRef;
+      outerHash = uriObj.ref;
+    } catch (ex) {
+      /* ignore, use the raw string */
+    }
+
+    let searchParams = new URLSearchParams(
+      url.substring("about:reader?".length)
+    );
+    if (!searchParams.has("url")) {
+      return null;
+    }
+    let originalUrl = searchParams.get("url");
+    if (outerHash) {
+      try {
+        let uriObj = Services.io.newURI(originalUrl);
+        uriObj = Services.io.newURI("#" + outerHash, null, uriObj);
+        originalUrl = uriObj.spec;
+      } catch (ex) {}
+    }
+    return originalUrl;
+  },
+
+  getOriginalUrlObjectForDisplay(url) {
+    let originalUrl = this.getOriginalUrl(url);
+    if (originalUrl) {
+      let uriObj;
+      try {
+        uriObj = Services.uriFixup.getFixupURIInfo(originalUrl).preferredURI;
+      } catch (ex) {
+        return null;
+      }
+      try {
+        return Services.io.createExposableURI(uriObj);
+      } catch (ex) {
+        return null;
+      }
+    }
+    return null;
+  },
+
+  /**
+   * Gets an article from a loaded browser's document. This method will not attempt
+   * to parse certain URIs (e.g. about: URIs).
+   *
+   * @param doc A document to parse.
+   * @return {Promise}
+   * @resolves JS object representing the article, or null if no article is found.
+   */
+  parseDocument(doc) {
+    if (
+      !lazy.Readerable.shouldCheckUri(doc.documentURIObject) ||
+      !lazy.Readerable.shouldCheckUri(doc.baseURIObject, true)
+    ) {
+      this.log("Reader mode disabled for URI");
+      return null;
+    }
+
+    return this._readerParse(doc);
+  },
+
+  /**
+   * Downloads and parses a document from a URL.
+   *
+   * @param url URL to download and parse.
+   * @return {Promise}
+   * @resolves JS object representing the article, or null if no article is found.
+   */
+  async downloadAndParseDocument(url, docContentType = "document") {
+    let result = await this._downloadDocument(url, docContentType);
+    if (!result?.doc) {
+      return null;
+    }
+    let { doc, newURL } = result;
+    if (
+      !lazy.Readerable.shouldCheckUri(doc.documentURIObject) ||
+      !lazy.Readerable.shouldCheckUri(doc.baseURIObject, true)
+    ) {
+      this.log("Reader mode disabled for URI");
+      return null;
+    }
+
+    let article = await this._readerParse(doc);
+    // If we have to redirect, reject to the caller with the parsed article,
+    // so we can update the URL before displaying it.
+    if (newURL) {
+      return Promise.reject({ newURL, article });
+    }
+    // Otherwise, we can just continue with the article.
+    return article;
+  },
+
+  _downloadDocument(url, docContentType = "document") {
+    try {
+      if (!lazy.Readerable.shouldCheckUri(Services.io.newURI(url))) {
+        return null;
+      }
+    } catch (ex) {
+      console.error(
+        new Error(`Couldn't create URI from ${url} to download: ${ex}`)
+      );
+      return null;
+    }
+    let histogram = Services.telemetry.getHistogramById(
+      "READER_MODE_DOWNLOAD_RESULT"
+    );
+    return new Promise((resolve, reject) => {
+      let xhr = new XMLHttpRequest();
+      xhr.open("GET", url, true);
+      xhr.onerror = evt => reject(evt.error);
+      xhr.responseType = docContentType === "text/plain" ? "text" : "document";
+      xhr.onload = evt => {
+        if (xhr.status !== 200) {
+          reject("Reader mode XHR failed with status: " + xhr.status);
+          histogram.add(DOWNLOAD_ERROR_XHR);
+          return;
+        }
+
+        let doc =
+          xhr.responseType === "text" ? xhr.responseText : xhr.responseXML;
+        if (!doc) {
+          reject("Reader mode XHR didn't return a document");
+          histogram.add(DOWNLOAD_ERROR_NO_DOC);
+          return;
+        }
+
+        let responseURL = xhr.responseURL;
+        let givenURL = url;
+        // Convert these to real URIs to make sure the escaping (or lack
+        // thereof) is identical:
+        try {
+          responseURL = Services.io.newURI(responseURL).specIgnoringRef;
+        } catch (ex) {
+          /* Ignore errors - we'll use what we had before */
+        }
+        try {
+          givenURL = Services.io.newURI(givenURL).specIgnoringRef;
+        } catch (ex) {
+          /* Ignore errors - we'll use what we had before */
+        }
+
+        if (xhr.responseType != "document") {
+          let initialText = doc;
+          let parser = new DOMParser();
+          doc = parser.parseFromString(`<pre></pre>`, "text/html");
+          doc.querySelector("pre").textContent = initialText;
+        }
+
+        // We treat redirects as download successes here:
+        histogram.add(DOWNLOAD_SUCCESS);
+
+        let result = { doc };
+        if (responseURL != givenURL) {
+          result.newURL = xhr.responseURL;
+        }
+
+        resolve(result);
+      };
+      xhr.send();
+    });
+  },
+
+  log(msg) {
+    if (this.DEBUG) {
+      dump("Reader: " + msg); // Debug output only; skipped while DEBUG is falsy.
+    }
+  },
+
+  /**
+   * Attempts to parse a document into an article. Heavy lifting happens
+   * in readerWorker.js.
+   *
+   * @param doc The document to parse.
+   * @return {Promise}
+   * @resolves JS object representing the article, or null if no article is found.
+   */
+  async _readerParse(doc) {
+    let histogram = Services.telemetry.getHistogramById(
+      "READER_MODE_PARSE_RESULT"
+    );
+    if (this.parseNodeLimit) {
+      let numTags = doc.getElementsByTagName("*").length;
+      if (numTags > this.parseNodeLimit) {
+        this.log(
+          "Aborting parse for " +
+            doc.baseURIObject.spec +
+            "; " +
+            numTags +
+            " elements found"
+        );
+        histogram.add(PARSE_ERROR_TOO_MANY_ELEMENTS);
+        return null;
+      }
+    }
+
+    // Fetch this here before we send `doc` off to the worker thread, as later on the
+    // document might be nuked but we will still want the URI.
+    let { documentURI } = doc;
+
+    let uriParam;
+    uriParam = {
+      spec: doc.baseURIObject.spec,
+      prePath: doc.baseURIObject.prePath,
+      scheme: doc.baseURIObject.scheme,
+
+      // Fallback
+      host: documentURI,
+      pathBase: documentURI,
+    };
+
+    // nsIURI.host throws an exception if a host doesn't exist.
+    try {
+      uriParam.host = doc.baseURIObject.host;
+      uriParam.pathBase = Services.io.newURI(".", null, doc.baseURIObject).spec;
+    } catch (ex) {
+      // Fall back to the initial values we assigned.
+      console.warn("Error accessing host name: ", ex);
+    }
+
+    // convert text/plain document, if any, to XHTML format
+    if (this._isDocumentPlainText(doc)) {
+      doc = this._convertPlainTextDocument(doc);
+    }
+
+    let serializer = new XMLSerializer();
+    let serializedDoc = serializer.serializeToString(doc);
+    // Explicitly null out doc to make it clear it might not be available from this
+    // point on.
+    doc = null;
+
+    let options = {
+      classesToPreserve: CLASSES_TO_PRESERVE,
+    };
+
+    let article = null;
+    try {
+      article = await lazy.ReaderWorker.post("parseDocument", [
+        uriParam,
+        serializedDoc,
+        options,
+      ]);
+    } catch (e) {
+      console.error("Error in ReaderWorker: ", e);
+      histogram.add(PARSE_ERROR_WORKER);
+    }
+
+    if (!article) {
+      this.log("Worker did not return an article");
+      histogram.add(PARSE_ERROR_NO_ARTICLE);
+      return null;
+    }
+
+    // Readability returns a URI object based on the baseURI, but we only care
+    // about the original document's URL from now on. This also avoids spoofing
+    // attempts where the baseURI doesn't match the domain of the documentURI
+    article.url = documentURI;
+    delete article.uri;
+
+    let flags =
+      Ci.nsIDocumentEncoder.OutputSelectionOnly |
+      Ci.nsIDocumentEncoder.OutputAbsoluteLinks;
+    article.title = Cc["@mozilla.org/parserutils;1"]
+      .getService(Ci.nsIParserUtils)
+      .convertToPlainText(article.title, flags, 0);
+    if (gIsFirefoxDesktop) {
+      await this._assignLanguage(article);
+      this._maybeAssignTextDirection(article);
+    }
+
+    this._assignReadTime(article);
+
+    histogram.add(PARSE_SUCCESS);
+    return article;
+  },
+
+  /**
+   * Sets a global language string value if the result is confident
+   *
+   * @return Promise
+   * @resolves when the language is detected
+   */
+  _assignLanguage(article) {
+    return lazy.LanguageDetector.detectLanguage(article.textContent).then(
+      result => {
+        article.language = result.confident ? result.language : null;
+      }
+    );
+  },
+
+  _maybeAssignTextDirection(article) {
+    // TODO: Remove the hardcoded language codes below once bug 1320265 is resolved.
+    if (
+      !article.dir &&
+      ["ar", "fa", "he", "ug", "ur"].includes(article.language)
+    ) {
+      article.dir = "rtl";
+    }
+  },
+
+  /**
+   * Assigns the estimated reading time range of the article to the article object.
+   *
+   * @param article the article object to assign the reading time estimate to.
+   */
+  _assignReadTime(article) {
+    let lang = article.language || "en";
+    const readingSpeed = this._getReadingSpeedForLanguage(lang);
+    const charactersPerMinuteLow = readingSpeed.cpm - readingSpeed.variance;
+    const charactersPerMinuteHigh = readingSpeed.cpm + readingSpeed.variance;
+    const length = article.length;
+
+    article.readingTimeMinsSlow = Math.ceil(length / charactersPerMinuteLow);
+    article.readingTimeMinsFast = Math.ceil(length / charactersPerMinuteHigh);
+  },
+
+  /**
+   * Returns the reading speed of a selection of languages with likely variance.
+   *
+   * Reading speed estimated from a study done on reading speeds in various languages.
+   * study can be found here: http://iovs.arvojournals.org/article.aspx?articleid=2166061
+   *
+   * @return object with characters per minute and variance. Defaults to English
+   *         if no suitable language is found in the collection.
+   */
+  _getReadingSpeedForLanguage(lang) {
+    const readingSpeed = new Map([
+      ["en", { cpm: 987, variance: 118 }],
+      ["ar", { cpm: 612, variance: 88 }],
+      ["de", { cpm: 920, variance: 86 }],
+      ["es", { cpm: 1025, variance: 127 }],
+      ["fi", { cpm: 1078, variance: 121 }],
+      ["fr", { cpm: 998, variance: 126 }],
+      ["he", { cpm: 833, variance: 130 }],
+      ["it", { cpm: 950, variance: 140 }],
+      ["jw", { cpm: 357, variance: 56 }],
+      ["nl", { cpm: 978, variance: 143 }],
+      ["pl", { cpm: 916, variance: 126 }],
+      ["pt", { cpm: 913, variance: 145 }],
+      ["ru", { cpm: 986, variance: 175 }],
+      ["sk", { cpm: 885, variance: 145 }],
+      ["sv", { cpm: 917, variance: 156 }],
+      ["tr", { cpm: 1054, variance: 156 }],
+      ["zh", { cpm: 255, variance: 29 }],
+    ]);
+
+    return readingSpeed.get(lang) || readingSpeed.get("en");
+  },
+  /**
+   *
+   * Check if the document to be parsed is text document.
+   * @param doc the doc object to be parsed.
+   * @return boolean
+   *
+   */
+  _isDocumentPlainText(doc) {
+    return doc.contentType == "text/plain";
+  },
+  /**
+   *
+   * The document to be parsed is text document and is converted to HTML format.
+   * @param doc the doc object to be parsed.
+   * @return doc
+   *
+   */
+  _convertPlainTextDocument(doc) {
+    let preTag = doc.querySelector("pre");
+    let docFrag = doc.createDocumentFragment();
+    let content = preTag.textContent;
+    let paragraphs = content.split(/\r?\n\r?\n/);
+    for (let para of paragraphs) {
+      let pElem = doc.createElement("p");
+      let lines = para.split(/\n/);
+      for (let line of lines) {
+        pElem.append(line);
+        let brElem = doc.createElement("br");
+        pElem.append(brElem);
+      }
+      docFrag.append(pElem);
+    }
+    // Clone the document to avoid the original document being affected
+    // (which shows up when exiting reader mode again).
+    let clone = doc.documentElement.cloneNode(true);
+    clone.querySelector("pre").replaceWith(docFrag);
+    return clone;
+  },
+};
+
+XPCOMUtils.defineLazyPreferenceGetter(
+  ReaderMode,
+  "maxElemsToParse", // name of the getter defined on ReaderMode
+  "reader.parse-node-limit", // preference that backs the getter
+  0 // default value
+);