summaryrefslogtreecommitdiffstats
path: root/toolkit/components/reader/ReaderMode.sys.mjs
diff options
context:
space:
mode:
Diffstat (limited to 'toolkit/components/reader/ReaderMode.sys.mjs')
-rw-r--r--toolkit/components/reader/ReaderMode.sys.mjs561
1 files changed, 561 insertions, 0 deletions
diff --git a/toolkit/components/reader/ReaderMode.sys.mjs b/toolkit/components/reader/ReaderMode.sys.mjs
new file mode 100644
index 0000000000..92fbcce367
--- /dev/null
+++ b/toolkit/components/reader/ReaderMode.sys.mjs
@@ -0,0 +1,561 @@
+// -*- indent-tabs-mode: nil; js-indent-level: 2 -*-
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// Constants for telemetry.
+const DOWNLOAD_SUCCESS = 0;
+const DOWNLOAD_ERROR_XHR = 1;
+const DOWNLOAD_ERROR_NO_DOC = 2;
+
+const PARSE_SUCCESS = 0;
+const PARSE_ERROR_TOO_MANY_ELEMENTS = 1;
+const PARSE_ERROR_WORKER = 2;
+const PARSE_ERROR_NO_ARTICLE = 3;
+
+// Class names to preserve in the readerized output. We preserve these class
+// names so that rules in aboutReader.css can match them.
+const CLASSES_TO_PRESERVE = [
+ "caption",
+ "emoji",
+ "hidden",
+ "invisible",
+ "sr-only",
+ "visually-hidden",
+ "visuallyhidden",
+ "wp-caption",
+ "wp-caption-text",
+ "wp-smiley",
+];
+
+import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
+
+const lazy = {};
+
+ChromeUtils.defineESModuleGetters(lazy, {
+ LanguageDetector:
+ "resource://gre/modules/translation/LanguageDetector.sys.mjs",
+ ReaderWorker: "resource://gre/modules/reader/ReaderWorker.sys.mjs",
+ Readerable: "resource://gre/modules/Readerable.sys.mjs",
+});
+
+const gIsFirefoxDesktop =
+ Services.appinfo.ID == "{ec8030f7-c20a-464f-9b0e-13a3a9e97384}";
+
+Services.telemetry.setEventRecordingEnabled("readermode", true);
+
+export var ReaderMode = {
+ DEBUG: 0,
+
+ // For time spent telemetry
+ enterTime: undefined,
+ leaveTime: undefined,
+
+ /**
+ * Enter the reader mode by going forward one step in history if applicable,
+ * if not, append the about:reader page in the history instead.
+ */
+ enterReaderMode(docShell, win) {
+ this.enterTime = Date.now();
+
+ Services.telemetry.recordEvent("readermode", "view", "on", null, {
+ subcategory: "feature",
+ });
+
+ let url = win.document.location.href;
+ let readerURL = "about:reader?url=" + encodeURIComponent(url);
+
+ if (!Services.appinfo.sessionHistoryInParent) {
+ let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);
+ let sh = webNav.sessionHistory;
+ if (webNav.canGoForward) {
+ let forwardEntry = sh.legacySHistory.getEntryAtIndex(sh.index + 1);
+ let forwardURL = forwardEntry.URI.spec;
+ if (forwardURL && (forwardURL == readerURL || !readerURL)) {
+ webNav.goForward();
+ return;
+ }
+ }
+ }
+
+ // This could possibly move to the parent. See bug 1664982.
+ win.document.location = readerURL;
+ },
+
+ /**
+ * Exit the reader mode by going back one step in history if applicable,
+ * if not, append the original page in the history instead.
+ */
+ leaveReaderMode(docShell, win) {
+ this.leaveTime = Date.now();
+
+ // Measured in seconds (whole number)
+ let timeSpentInReaderMode = Math.floor(
+ (this.leaveTime - this.enterTime) / 1000
+ );
+
+ // Measured as percentage (whole number)
+ let scrollPosition = Math.floor(
+ ((win.scrollY + win.innerHeight) / win.document.body.clientHeight) * 100
+ );
+
+ Services.telemetry.recordEvent("readermode", "view", "off", null, {
+ subcategory: "feature",
+ reader_time: `${timeSpentInReaderMode}`,
+ scroll_position: `${scrollPosition}`,
+ });
+
+ let url = win.document.location.href;
+ let originalURL = this.getOriginalUrl(url);
+ let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);
+
+ if (!Services.appinfo.sessionHistoryInParent) {
+ let sh = webNav.sessionHistory;
+ if (webNav.canGoBack) {
+ let prevEntry = sh.legacySHistory.getEntryAtIndex(sh.index - 1);
+ let prevURL = prevEntry.URI.spec;
+ if (prevURL && (prevURL == originalURL || !originalURL)) {
+ webNav.goBack();
+ return;
+ }
+ }
+ }
+
+ let referrerURI, principal;
+ try {
+ referrerURI = Services.io.newURI(url);
+ principal = Services.scriptSecurityManager.createContentPrincipal(
+ referrerURI,
+ win.document.nodePrincipal.originAttributes
+ );
+ } catch (e) {
+ console.error(e);
+ return;
+ }
+ let loadFlags = webNav.LOAD_FLAGS_DISALLOW_INHERIT_PRINCIPAL;
+ let ReferrerInfo = Components.Constructor(
+ "@mozilla.org/referrer-info;1",
+ "nsIReferrerInfo",
+ "init"
+ );
+ let loadURIOptions = {
+ triggeringPrincipal: principal,
+ loadFlags,
+ referrerInfo: new ReferrerInfo(
+ Ci.nsIReferrerInfo.EMPTY,
+ true,
+ referrerURI
+ ),
+ };
+ // This could possibly move to the parent. See bug 1664982.
+ webNav.fixupAndLoadURIString(originalURL, loadURIOptions);
+ },
+
+ /**
+ * Returns original URL from an about:reader URL.
+ *
+ * @param url An about:reader URL.
+ * @return The original URL for the article, or null if we did not find
+ * a properly formatted about:reader URL.
+ */
+ getOriginalUrl(url) {
+ if (!url.startsWith("about:reader?")) {
+ return null;
+ }
+
+ let outerHash = "";
+ try {
+ let uriObj = Services.io.newURI(url);
+ url = uriObj.specIgnoringRef;
+ outerHash = uriObj.ref;
+ } catch (ex) {
+ /* ignore, use the raw string */
+ }
+
+ let searchParams = new URLSearchParams(
+ url.substring("about:reader?".length)
+ );
+ if (!searchParams.has("url")) {
+ return null;
+ }
+ let originalUrl = searchParams.get("url");
+ if (outerHash) {
+ try {
+ let uriObj = Services.io.newURI(originalUrl);
+ uriObj = Services.io.newURI("#" + outerHash, null, uriObj);
+ originalUrl = uriObj.spec;
+ } catch (ex) {}
+ }
+ return originalUrl;
+ },
+
+ getOriginalUrlObjectForDisplay(url) {
+ let originalUrl = this.getOriginalUrl(url);
+ if (originalUrl) {
+ let uriObj;
+ try {
+ uriObj = Services.uriFixup.getFixupURIInfo(originalUrl).preferredURI;
+ } catch (ex) {
+ return null;
+ }
+ try {
+ return Services.io.createExposableURI(uriObj);
+ } catch (ex) {
+ return null;
+ }
+ }
+ return null;
+ },
+
+ /**
+ * Gets an article from a loaded browser's document. This method will not attempt
+ * to parse certain URIs (e.g. about: URIs).
+ *
+ * @param doc A document to parse.
+ * @return {Promise}
+ * @resolves JS object representing the article, or null if no article is found.
+ */
+ parseDocument(doc) {
+ if (
+ !lazy.Readerable.shouldCheckUri(doc.documentURIObject) ||
+ !lazy.Readerable.shouldCheckUri(doc.baseURIObject, true)
+ ) {
+ this.log("Reader mode disabled for URI");
+ return null;
+ }
+
+ return this._readerParse(doc);
+ },
+
+ /**
+ * Downloads and parses a document from a URL.
+ *
+ * @param url URL to download and parse.
+ * @return {Promise}
+ * @resolves JS object representing the article, or null if no article is found.
+ */
+ async downloadAndParseDocument(url, docContentType = "document") {
+ let result = await this._downloadDocument(url, docContentType);
+ if (!result?.doc) {
+ return null;
+ }
+ let { doc, newURL } = result;
+ if (
+ !lazy.Readerable.shouldCheckUri(doc.documentURIObject) ||
+ !lazy.Readerable.shouldCheckUri(doc.baseURIObject, true)
+ ) {
+ this.log("Reader mode disabled for URI");
+ return null;
+ }
+
+ let article = await this._readerParse(doc);
+ // If we have to redirect, reject to the caller with the parsed article,
+ // so we can update the URL before displaying it.
+ if (newURL) {
+ return Promise.reject({ newURL, article });
+ }
+ // Otherwise, we can just continue with the article.
+ return article;
+ },
+
+ _downloadDocument(url, docContentType = "document") {
+ try {
+ if (!lazy.Readerable.shouldCheckUri(Services.io.newURI(url))) {
+ return null;
+ }
+ } catch (ex) {
+ console.error(
+ new Error(`Couldn't create URI from ${url} to download: ${ex}`)
+ );
+ return null;
+ }
+ let histogram = Services.telemetry.getHistogramById(
+ "READER_MODE_DOWNLOAD_RESULT"
+ );
+ return new Promise((resolve, reject) => {
+ let xhr = new XMLHttpRequest();
+ xhr.open("GET", url, true);
+ xhr.onerror = evt => reject(evt.error);
+ xhr.responseType = docContentType === "text/plain" ? "text" : "document";
+ xhr.onload = evt => {
+ if (xhr.status !== 200) {
+ reject("Reader mode XHR failed with status: " + xhr.status);
+ histogram.add(DOWNLOAD_ERROR_XHR);
+ return;
+ }
+
+ let doc =
+ xhr.responseType === "text" ? xhr.responseText : xhr.responseXML;
+ if (!doc) {
+ reject("Reader mode XHR didn't return a document");
+ histogram.add(DOWNLOAD_ERROR_NO_DOC);
+ return;
+ }
+
+ let responseURL = xhr.responseURL;
+ let givenURL = url;
+ // Convert these to real URIs to make sure the escaping (or lack
+ // thereof) is identical:
+ try {
+ responseURL = Services.io.newURI(responseURL).specIgnoringRef;
+ } catch (ex) {
+ /* Ignore errors - we'll use what we had before */
+ }
+ try {
+ givenURL = Services.io.newURI(givenURL).specIgnoringRef;
+ } catch (ex) {
+ /* Ignore errors - we'll use what we had before */
+ }
+
+ if (xhr.responseType != "document") {
+ let initialText = doc;
+ let parser = new DOMParser();
+ doc = parser.parseFromString(`<pre></pre>`, "text/html");
+ doc.querySelector("pre").textContent = initialText;
+ }
+
+ // We treat redirects as download successes here:
+ histogram.add(DOWNLOAD_SUCCESS);
+
+ let result = { doc };
+ if (responseURL != givenURL) {
+ result.newURL = xhr.responseURL;
+ }
+
+ resolve(result);
+ };
+ xhr.send();
+ });
+ },
+
+ log(msg) {
+ if (this.DEBUG) {
+ dump("Reader: " + msg);
+ }
+ },
+
+ /**
+ * Attempts to parse a document into an article. Heavy lifting happens
+ * in readerWorker.js.
+ *
+ * @param doc The document to parse.
+ * @return {Promise}
+ * @resolves JS object representing the article, or null if no article is found.
+ */
+ async _readerParse(doc) {
+ let histogram = Services.telemetry.getHistogramById(
+ "READER_MODE_PARSE_RESULT"
+ );
+ if (this.parseNodeLimit) {
+ let numTags = doc.getElementsByTagName("*").length;
+ if (numTags > this.parseNodeLimit) {
+ this.log(
+ "Aborting parse for " +
+ doc.baseURIObject.spec +
+ "; " +
+ numTags +
+ " elements found"
+ );
+ histogram.add(PARSE_ERROR_TOO_MANY_ELEMENTS);
+ return null;
+ }
+ }
+
+ // Fetch this here before we send `doc` off to the worker thread, as later on the
+ // document might be nuked but we will still want the URI.
+ let { documentURI } = doc;
+
+ let uriParam;
+ uriParam = {
+ spec: doc.baseURIObject.spec,
+ prePath: doc.baseURIObject.prePath,
+ scheme: doc.baseURIObject.scheme,
+
+ // Fallback
+ host: documentURI,
+ pathBase: documentURI,
+ };
+
+ // nsIURI.host throws an exception if a host doesn't exist.
+ try {
+ uriParam.host = doc.baseURIObject.host;
+ uriParam.pathBase = Services.io.newURI(".", null, doc.baseURIObject).spec;
+ } catch (ex) {
+ // Fall back to the initial values we assigned.
+ console.warn("Error accessing host name: ", ex);
+ }
+
+ // convert text/plain document, if any, to XHTML format
+ if (this._isDocumentPlainText(doc)) {
+ doc = this._convertPlainTextDocument(doc);
+ }
+
+ let serializer = new XMLSerializer();
+ let serializedDoc = serializer.serializeToString(doc);
+ // Explicitly null out doc to make it clear it might not be available from this
+ // point on.
+ doc = null;
+
+ let options = {
+ classesToPreserve: CLASSES_TO_PRESERVE,
+ };
+
+ let article = null;
+ try {
+ article = await lazy.ReaderWorker.post("parseDocument", [
+ uriParam,
+ serializedDoc,
+ options,
+ ]);
+ } catch (e) {
+ console.error("Error in ReaderWorker: ", e);
+ histogram.add(PARSE_ERROR_WORKER);
+ }
+
+ if (!article) {
+ this.log("Worker did not return an article");
+ histogram.add(PARSE_ERROR_NO_ARTICLE);
+ return null;
+ }
+
+ // Readability returns a URI object based on the baseURI, but we only care
+ // about the original document's URL from now on. This also avoids spoofing
+ // attempts where the baseURI doesn't match the domain of the documentURI
+ article.url = documentURI;
+ delete article.uri;
+
+ let flags =
+ Ci.nsIDocumentEncoder.OutputSelectionOnly |
+ Ci.nsIDocumentEncoder.OutputAbsoluteLinks;
+ article.title = Cc["@mozilla.org/parserutils;1"]
+ .getService(Ci.nsIParserUtils)
+ .convertToPlainText(article.title, flags, 0);
+ if (gIsFirefoxDesktop) {
+ await this._assignLanguage(article);
+ this._maybeAssignTextDirection(article);
+ }
+
+ this._assignReadTime(article);
+
+ histogram.add(PARSE_SUCCESS);
+ return article;
+ },
+
+ /**
+ * Sets a global language string value if the result is confident
+ *
+ * @return Promise
+ * @resolves when the language is detected
+ */
+ _assignLanguage(article) {
+ return lazy.LanguageDetector.detectLanguage(article.textContent).then(
+ result => {
+ article.language = result.confident ? result.language : null;
+ }
+ );
+ },
+
+ _maybeAssignTextDirection(article) {
+ // TODO: Remove the hardcoded language codes below once bug 1320265 is resolved.
+ if (
+ !article.dir &&
+ ["ar", "fa", "he", "ug", "ur"].includes(article.language)
+ ) {
+ article.dir = "rtl";
+ }
+ },
+
+ /**
+ * Assigns the estimated reading time range of the article to the article object.
+ *
+ * @param article the article object to assign the reading time estimate to.
+ */
+ _assignReadTime(article) {
+ let lang = article.language || "en";
+ const readingSpeed = this._getReadingSpeedForLanguage(lang);
+ const charactersPerMinuteLow = readingSpeed.cpm - readingSpeed.variance;
+ const charactersPerMinuteHigh = readingSpeed.cpm + readingSpeed.variance;
+ const length = article.length;
+
+ article.readingTimeMinsSlow = Math.ceil(length / charactersPerMinuteLow);
+ article.readingTimeMinsFast = Math.ceil(length / charactersPerMinuteHigh);
+ },
+
+ /**
+ * Returns the reading speed of a selection of languages with likely variance.
+ *
+ * Reading speed estimated from a study done on reading speeds in various languages.
+ * study can be found here: http://iovs.arvojournals.org/article.aspx?articleid=2166061
+ *
+ * @return object with characters per minute and variance. Defaults to English
+ * if no suitable language is found in the collection.
+ */
+ _getReadingSpeedForLanguage(lang) {
+ const readingSpeed = new Map([
+ ["en", { cpm: 987, variance: 118 }],
+ ["ar", { cpm: 612, variance: 88 }],
+ ["de", { cpm: 920, variance: 86 }],
+ ["es", { cpm: 1025, variance: 127 }],
+ ["fi", { cpm: 1078, variance: 121 }],
+ ["fr", { cpm: 998, variance: 126 }],
+ ["he", { cpm: 833, variance: 130 }],
+ ["it", { cpm: 950, variance: 140 }],
+ ["jw", { cpm: 357, variance: 56 }],
+ ["nl", { cpm: 978, variance: 143 }],
+ ["pl", { cpm: 916, variance: 126 }],
+ ["pt", { cpm: 913, variance: 145 }],
+ ["ru", { cpm: 986, variance: 175 }],
+ ["sk", { cpm: 885, variance: 145 }],
+ ["sv", { cpm: 917, variance: 156 }],
+ ["tr", { cpm: 1054, variance: 156 }],
+ ["zh", { cpm: 255, variance: 29 }],
+ ]);
+
+ return readingSpeed.get(lang) || readingSpeed.get("en");
+ },
+ /**
+ *
+ * Check if the document to be parsed is text document.
+ * @param doc the doc object to be parsed.
+ * @return boolean
+ *
+ */
+ _isDocumentPlainText(doc) {
+ return doc.contentType == "text/plain";
+ },
+ /**
+ *
+ * The document to be parsed is text document and is converted to HTML format.
+ * @param doc the doc object to be parsed.
+ * @return doc
+ *
+ */
+ _convertPlainTextDocument(doc) {
+ let preTag = doc.querySelector("pre");
+ let docFrag = doc.createDocumentFragment();
+ let content = preTag.textContent;
+ let paragraphs = content.split(/\r?\n\r?\n/);
+ for (let para of paragraphs) {
+ let pElem = doc.createElement("p");
+ let lines = para.split(/\n/);
+ for (let line of lines) {
+ pElem.append(line);
+ let brElem = doc.createElement("br");
+ pElem.append(brElem);
+ }
+ docFrag.append(pElem);
+ }
+ // Clone the document to avoid the original document being affected
+ // (which shows up when exiting reader mode again).
+ let clone = doc.documentElement.cloneNode(true);
+ clone.querySelector("pre").replaceWith(docFrag);
+ return clone;
+ },
+};
+
+XPCOMUtils.defineLazyPreferenceGetter(
+ ReaderMode,
+ "maxElemsToParse",
+ "reader.parse-node-limit",
+ 0
+);