diff options
Diffstat (limited to 'toolkit/components/reader/ReaderMode.sys.mjs')
-rw-r--r-- | toolkit/components/reader/ReaderMode.sys.mjs | 561 |
1 files changed, 561 insertions, 0 deletions
// -*- indent-tabs-mode: nil; js-indent-level: 2 -*-
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */

import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";

// Constants for telemetry (buckets of READER_MODE_DOWNLOAD_RESULT).
const DOWNLOAD_SUCCESS = 0;
const DOWNLOAD_ERROR_XHR = 1;
const DOWNLOAD_ERROR_NO_DOC = 2;

// Constants for telemetry (buckets of READER_MODE_PARSE_RESULT).
const PARSE_SUCCESS = 0;
const PARSE_ERROR_TOO_MANY_ELEMENTS = 1;
const PARSE_ERROR_WORKER = 2;
const PARSE_ERROR_NO_ARTICLE = 3;

// Class names to preserve in the readerized output. We preserve these class
// names so that rules in aboutReader.css can match them.
const CLASSES_TO_PRESERVE = [
  "caption",
  "emoji",
  "hidden",
  "invisible",
  "sr-only",
  "visually-hidden",
  "visuallyhidden",
  "wp-caption",
  "wp-caption-text",
  "wp-smiley",
];

// Reading speed (characters per minute) and its variance, keyed by language
// code. Built once at module load rather than on every lookup.
//
// Reading speed estimated from a study done on reading speeds in various
// languages. The study can be found here:
// http://iovs.arvojournals.org/article.aspx?articleid=2166061
//
// NOTE(review): the 357/56 and 885/145 rows were previously keyed "jw"
// (Javanese) and "sk" (Slovak). They are re-keyed here to "ja" (Japanese) and
// "sl" (Slovenian) on the understanding that those are the languages the
// study measured -- confirm against the study's result table.
const READING_SPEEDS = new Map([
  ["en", { cpm: 987, variance: 118 }],
  ["ar", { cpm: 612, variance: 88 }],
  ["de", { cpm: 920, variance: 86 }],
  ["es", { cpm: 1025, variance: 127 }],
  ["fi", { cpm: 1078, variance: 121 }],
  ["fr", { cpm: 998, variance: 126 }],
  ["he", { cpm: 833, variance: 130 }],
  ["it", { cpm: 950, variance: 140 }],
  ["ja", { cpm: 357, variance: 56 }],
  ["nl", { cpm: 978, variance: 143 }],
  ["pl", { cpm: 916, variance: 126 }],
  ["pt", { cpm: 913, variance: 145 }],
  ["ru", { cpm: 986, variance: 175 }],
  ["sl", { cpm: 885, variance: 145 }],
  ["sv", { cpm: 917, variance: 156 }],
  ["tr", { cpm: 1054, variance: 156 }],
  ["zh", { cpm: 255, variance: 29 }],
]);

const lazy = {};

ChromeUtils.defineESModuleGetters(lazy, {
  LanguageDetector:
    "resource://gre/modules/translation/LanguageDetector.sys.mjs",
  ReaderWorker: "resource://gre/modules/reader/ReaderWorker.sys.mjs",
  Readerable: "resource://gre/modules/Readerable.sys.mjs",
});

const gIsFirefoxDesktop =
  Services.appinfo.ID === "{ec8030f7-c20a-464f-9b0e-13a3a9e97384}";

Services.telemetry.setEventRecordingEnabled("readermode", true);

export var ReaderMode = {
  // Set to a truthy value to make log() write to dump().
  DEBUG: 0,

  // For time spent telemetry
  enterTime: undefined,
  leaveTime: undefined,

  /**
   * Enter the reader mode by going forward one step in history if applicable,
   * if not, append the about:reader page in the history instead.
   *
   * @param docShell The docshell of the page; queried for nsIWebNavigation.
   * @param win The content window whose document location is switched.
   */
  enterReaderMode(docShell, win) {
    this.enterTime = Date.now();

    Services.telemetry.recordEvent("readermode", "view", "on", null, {
      subcategory: "feature",
    });

    let url = win.document.location.href;
    let readerURL = "about:reader?url=" + encodeURIComponent(url);

    if (!Services.appinfo.sessionHistoryInParent) {
      let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);
      let sh = webNav.sessionHistory;
      if (webNav.canGoForward) {
        let forwardEntry = sh.legacySHistory.getEntryAtIndex(sh.index + 1);
        let forwardURL = forwardEntry.URI.spec;
        // readerURL is always a non-empty string here, so the only case that
        // matters is the forward entry already being the reader page.
        if (forwardURL === readerURL) {
          webNav.goForward();
          return;
        }
      }
    }

    // This could possibly move to the parent. See bug 1664982.
    win.document.location = readerURL;
  },

  /**
   * Exit the reader mode by going back one step in history if applicable,
   * if not, append the original page in the history instead.
   *
   * Also records the "readermode / view / off" telemetry event with the time
   * spent in reader mode and the scroll position reached.
   *
   * @param docShell The docshell of the page; queried for nsIWebNavigation.
   * @param win The about:reader content window being left.
   */
  leaveReaderMode(docShell, win) {
    this.leaveTime = Date.now();

    // Measured in seconds (whole number).
    // NOTE(review): if enterReaderMode() never ran on this object, enterTime
    // is still undefined and this evaluates to NaN.
    let timeSpentInReaderMode = Math.floor(
      (this.leaveTime - this.enterTime) / 1000
    );

    // Measured as percentage (whole number)
    let scrollPosition = Math.floor(
      ((win.scrollY + win.innerHeight) / win.document.body.clientHeight) * 100
    );

    Services.telemetry.recordEvent("readermode", "view", "off", null, {
      subcategory: "feature",
      reader_time: `${timeSpentInReaderMode}`,
      scroll_position: `${scrollPosition}`,
    });

    let url = win.document.location.href;
    let originalURL = this.getOriginalUrl(url);
    let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);

    if (!Services.appinfo.sessionHistoryInParent) {
      let sh = webNav.sessionHistory;
      if (webNav.canGoBack) {
        let prevEntry = sh.legacySHistory.getEntryAtIndex(sh.index - 1);
        let prevURL = prevEntry.URI.spec;
        // Go back when the previous entry is the original page, or when no
        // original URL could be extracted from the about:reader URL at all.
        if (prevURL && (prevURL == originalURL || !originalURL)) {
          webNav.goBack();
          return;
        }
      }
    }

    let referrerURI, principal;
    try {
      referrerURI = Services.io.newURI(url);
      principal = Services.scriptSecurityManager.createContentPrincipal(
        referrerURI,
        win.document.nodePrincipal.originAttributes
      );
    } catch (e) {
      console.error(e);
      return;
    }
    let loadFlags = webNav.LOAD_FLAGS_DISALLOW_INHERIT_PRINCIPAL;
    let ReferrerInfo = Components.Constructor(
      "@mozilla.org/referrer-info;1",
      "nsIReferrerInfo",
      "init"
    );
    let loadURIOptions = {
      triggeringPrincipal: principal,
      loadFlags,
      referrerInfo: new ReferrerInfo(
        Ci.nsIReferrerInfo.EMPTY,
        true,
        referrerURI
      ),
    };
    // This could possibly move to the parent. See bug 1664982.
    webNav.fixupAndLoadURIString(originalURL, loadURIOptions);
  },

  /**
   * Returns original URL from an about:reader URL.
   *
   * A fragment on the about:reader URL itself is moved onto the returned
   * original URL.
   *
   * @param url An about:reader URL.
   * @return The original URL for the article, or null if we did not find
   *         a properly formatted about:reader URL.
   */
  getOriginalUrl(url) {
    if (!url.startsWith("about:reader?")) {
      return null;
    }

    let outerHash = "";
    try {
      let uriObj = Services.io.newURI(url);
      url = uriObj.specIgnoringRef;
      outerHash = uriObj.ref;
    } catch (ex) {
      /* ignore, use the raw string */
    }

    let searchParams = new URLSearchParams(
      url.substring("about:reader?".length)
    );
    if (!searchParams.has("url")) {
      return null;
    }
    let originalUrl = searchParams.get("url");
    if (outerHash) {
      try {
        let uriObj = Services.io.newURI(originalUrl);
        uriObj = Services.io.newURI("#" + outerHash, null, uriObj);
        originalUrl = uriObj.spec;
      } catch (ex) {
        // Best effort: if the original URL cannot be parsed, return it
        // without the outer fragment.
      }
    }
    return originalUrl;
  },

  /**
   * Returns an exposable URI object for the original URL embedded in an
   * about:reader URL, suitable for display.
   *
   * @param url An about:reader URL.
   * @return The exposable URI, or null if there is no original URL or if
   *         fixing it up / making it exposable throws.
   */
  getOriginalUrlObjectForDisplay(url) {
    let originalUrl = this.getOriginalUrl(url);
    if (!originalUrl) {
      return null;
    }
    try {
      let uriObj = Services.uriFixup.getFixupURIInfo(originalUrl).preferredURI;
      return Services.io.createExposableURI(uriObj);
    } catch (ex) {
      return null;
    }
  },

  /**
   * Gets an article from a loaded browser's document. This method will not attempt
   * to parse certain URIs (e.g. about: URIs).
   *
   * @param doc A document to parse.
   * @return {Promise|null} null (synchronously) when the URI must not be
   *         checked, otherwise a promise.
   * @resolves JS object representing the article, or null if no article is found.
   */
  parseDocument(doc) {
    if (
      !lazy.Readerable.shouldCheckUri(doc.documentURIObject) ||
      !lazy.Readerable.shouldCheckUri(doc.baseURIObject, true)
    ) {
      this.log("Reader mode disabled for URI");
      return null;
    }

    return this._readerParse(doc);
  },

  /**
   * Downloads and parses a document from a URL.
   *
   * @param url URL to download and parse.
   * @param docContentType "text/plain" to download the resource as text;
   *        anything else downloads it as a document.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   * @rejects with a plain `{ newURL, article }` object when the download was
   *          redirected, so the caller can update the URL before display.
   */
  async downloadAndParseDocument(url, docContentType = "document") {
    let result = await this._downloadDocument(url, docContentType);
    if (!result?.doc) {
      return null;
    }
    let { doc, newURL } = result;
    if (
      !lazy.Readerable.shouldCheckUri(doc.documentURIObject) ||
      !lazy.Readerable.shouldCheckUri(doc.baseURIObject, true)
    ) {
      this.log("Reader mode disabled for URI");
      return null;
    }

    let article = await this._readerParse(doc);
    // If we have to redirect, reject to the caller with the parsed article,
    // so we can update the URL before displaying it.
    if (newURL) {
      return Promise.reject({ newURL, article });
    }
    // Otherwise, we can just continue with the article.
    return article;
  },

  /**
   * Downloads a URL with XMLHttpRequest and records the outcome in the
   * READER_MODE_DOWNLOAD_RESULT histogram.
   *
   * @param url URL to download.
   * @param docContentType "text/plain" to request text (which is then wrapped
   *        in a <pre> inside a fresh HTML document); anything else requests a
   *        document.
   * @return {Promise|null} null (synchronously) if the URL cannot be turned
   *         into a URI or must not be checked; otherwise a promise.
   * @resolves `{ doc }`, plus `newURL` when the response URL differs from the
   *           requested one (ignoring fragments).
   * @rejects on network error, non-200 status, or an empty response.
   */
  _downloadDocument(url, docContentType = "document") {
    try {
      if (!lazy.Readerable.shouldCheckUri(Services.io.newURI(url))) {
        return null;
      }
    } catch (ex) {
      console.error(
        new Error(`Couldn't create URI from ${url} to download: ${ex}`)
      );
      return null;
    }
    let histogram = Services.telemetry.getHistogramById(
      "READER_MODE_DOWNLOAD_RESULT"
    );
    return new Promise((resolve, reject) => {
      let xhr = new XMLHttpRequest();
      xhr.open("GET", url, true);
      xhr.onerror = evt => {
        // Network-level failures are XHR errors too; record them so they are
        // not missing from the histogram.
        histogram.add(DOWNLOAD_ERROR_XHR);
        // The event normally carries no `error`, so fall back to a real Error
        // instead of rejecting with undefined.
        reject(
          evt.error ?? new Error("Reader mode XHR failed with a network error")
        );
      };
      xhr.responseType = docContentType === "text/plain" ? "text" : "document";
      xhr.onload = () => {
        if (xhr.status !== 200) {
          reject("Reader mode XHR failed with status: " + xhr.status);
          histogram.add(DOWNLOAD_ERROR_XHR);
          return;
        }

        let doc =
          xhr.responseType === "text" ? xhr.responseText : xhr.responseXML;
        if (!doc) {
          reject("Reader mode XHR didn't return a document");
          histogram.add(DOWNLOAD_ERROR_NO_DOC);
          return;
        }

        let responseURL = xhr.responseURL;
        let givenURL = url;
        // Convert these to real URIs to make sure the escaping (or lack
        // thereof) is identical:
        try {
          responseURL = Services.io.newURI(responseURL).specIgnoringRef;
        } catch (ex) {
          /* Ignore errors - we'll use what we had before */
        }
        try {
          givenURL = Services.io.newURI(givenURL).specIgnoringRef;
        } catch (ex) {
          /* Ignore errors - we'll use what we had before */
        }

        if (xhr.responseType !== "document") {
          // Wrap the downloaded text in a <pre> of a new HTML document.
          // textContent is used so the text is never parsed as markup.
          let initialText = doc;
          let parser = new DOMParser();
          doc = parser.parseFromString(`<pre></pre>`, "text/html");
          doc.querySelector("pre").textContent = initialText;
        }

        // We treat redirects as download successes here:
        histogram.add(DOWNLOAD_SUCCESS);

        let result = { doc };
        if (responseURL != givenURL) {
          result.newURL = xhr.responseURL;
        }

        resolve(result);
      };
      xhr.send();
    });
  },

  /**
   * Writes a debug message via dump() when DEBUG is truthy.
   *
   * @param msg The message to log.
   */
  log(msg) {
    if (this.DEBUG) {
      // dump() does not append a newline itself.
      dump(`Reader: ${msg}\n`);
    }
  },

  /**
   * Attempts to parse a document into an article. Heavy lifting happens
   * in readerWorker.js.
   *
   * Records the outcome in the READER_MODE_PARSE_RESULT histogram.
   *
   * @param doc The document to parse.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   */
  async _readerParse(doc) {
    let histogram = Services.telemetry.getHistogramById(
      "READER_MODE_PARSE_RESULT"
    );

    // The "reader.parse-node-limit" pref is exposed on this object as
    // `maxElemsToParse` (see the lazy preference getter at the bottom of this
    // file). `parseNodeLimit` is not defined anywhere in this file, so on its
    // own it would leave the limit permanently disabled; it is still honoured
    // first in case something assigns it from outside.
    let nodeLimit = this.parseNodeLimit ?? this.maxElemsToParse;
    if (nodeLimit) {
      let numTags = doc.getElementsByTagName("*").length;
      if (numTags > nodeLimit) {
        this.log(
          "Aborting parse for " +
            doc.baseURIObject.spec +
            "; " +
            numTags +
            " elements found"
        );
        histogram.add(PARSE_ERROR_TOO_MANY_ELEMENTS);
        return null;
      }
    }

    // Fetch this here before we send `doc` off to the worker thread, as later on the
    // document might be nuked but we will still want the URI.
    let { documentURI } = doc;

    let uriParam = {
      spec: doc.baseURIObject.spec,
      prePath: doc.baseURIObject.prePath,
      scheme: doc.baseURIObject.scheme,

      // Fallback
      host: documentURI,
      pathBase: documentURI,
    };

    // nsIURI.host throws an exception if a host doesn't exist.
    try {
      uriParam.host = doc.baseURIObject.host;
      uriParam.pathBase = Services.io.newURI(".", null, doc.baseURIObject).spec;
    } catch (ex) {
      // Fall back to the initial values we assigned.
      console.warn("Error accessing host name: ", ex);
    }

    // convert text/plain document, if any, to XHTML format
    if (this._isDocumentPlainText(doc)) {
      doc = this._convertPlainTextDocument(doc);
    }

    let serializer = new XMLSerializer();
    let serializedDoc = serializer.serializeToString(doc);
    // Explicitly null out doc to make it clear it might not be available from this
    // point on.
    doc = null;

    let options = {
      classesToPreserve: CLASSES_TO_PRESERVE,
    };

    let article;
    try {
      article = await lazy.ReaderWorker.post("parseDocument", [
        uriParam,
        serializedDoc,
        options,
      ]);
    } catch (e) {
      console.error("Error in ReaderWorker: ", e);
      histogram.add(PARSE_ERROR_WORKER);
      // Return here so a worker failure is recorded once, not additionally
      // as PARSE_ERROR_NO_ARTICLE below.
      return null;
    }

    if (!article) {
      this.log("Worker did not return an article");
      histogram.add(PARSE_ERROR_NO_ARTICLE);
      return null;
    }

    // Readability returns a URI object based on the baseURI, but we only care
    // about the original document's URL from now on. This also avoids spoofing
    // attempts where the baseURI doesn't match the domain of the documentURI
    article.url = documentURI;
    delete article.uri;

    let flags =
      Ci.nsIDocumentEncoder.OutputSelectionOnly |
      Ci.nsIDocumentEncoder.OutputAbsoluteLinks;
    article.title = Cc["@mozilla.org/parserutils;1"]
      .getService(Ci.nsIParserUtils)
      .convertToPlainText(article.title, flags, 0);
    if (gIsFirefoxDesktop) {
      await this._assignLanguage(article);
      this._maybeAssignTextDirection(article);
    }

    this._assignReadTime(article);

    histogram.add(PARSE_SUCCESS);
    return article;
  },

  /**
   * Sets article.language to the detected language if the result is
   * confident, and to null otherwise.
   *
   * @param article The article object; its textContent is analysed.
   * @return Promise
   * @resolves when the language is detected
   */
  async _assignLanguage(article) {
    let result = await lazy.LanguageDetector.detectLanguage(
      article.textContent
    );
    article.language = result.confident ? result.language : null;
  },

  /**
   * Sets article.dir to "rtl" when the article has no direction yet and its
   * language is one of a hardcoded list of right-to-left languages.
   *
   * @param article The article object to update.
   */
  _maybeAssignTextDirection(article) {
    // TODO: Remove the hardcoded language codes below once bug 1320265 is resolved.
    if (
      !article.dir &&
      ["ar", "fa", "he", "ug", "ur"].includes(article.language)
    ) {
      article.dir = "rtl";
    }
  },

  /**
   * Assigns the estimated reading time range of the article to the article object.
   *
   * Sets readingTimeMinsSlow and readingTimeMinsFast, both rounded up to
   * whole minutes.
   *
   * @param article the article object to assign the reading time estimate to.
   */
  _assignReadTime(article) {
    let lang = article.language || "en";
    const { cpm, variance } = this._getReadingSpeedForLanguage(lang);
    const { length } = article;

    article.readingTimeMinsSlow = Math.ceil(length / (cpm - variance));
    article.readingTimeMinsFast = Math.ceil(length / (cpm + variance));
  },

  /**
   * Returns the reading speed of a selection of languages with likely variance.
   *
   * @param lang A language code to look up in the reading-speed table.
   * @return object with characters per minute and variance. Defaults to English
   *         if no suitable language is found in the collection.
   */
  _getReadingSpeedForLanguage(lang) {
    return READING_SPEEDS.get(lang) || READING_SPEEDS.get("en");
  },

  /**
   * Check if the document to be parsed is text document.
   *
   * @param doc the doc object to be parsed.
   * @return boolean
   */
  _isDocumentPlainText(doc) {
    return doc.contentType === "text/plain";
  },

  /**
   * The document to be parsed is text document and is converted to HTML format.
   *
   * The text of the first <pre> is split into paragraphs on blank lines; each
   * paragraph becomes a <p> whose lines are each followed by a <br>.
   *
   * @param doc the doc object to be parsed.
   * @return A clone of the document's root element with the <pre> replaced
   *         by the generated paragraphs.
   */
  _convertPlainTextDocument(doc) {
    let preTag = doc.querySelector("pre");
    let docFrag = doc.createDocumentFragment();
    let content = preTag.textContent;
    let paragraphs = content.split(/\r?\n\r?\n/);
    for (let para of paragraphs) {
      let pElem = doc.createElement("p");
      // Accept CRLF as well as LF so no stray "\r" is left on each line.
      for (let line of para.split(/\r?\n/)) {
        pElem.append(line, doc.createElement("br"));
      }
      docFrag.append(pElem);
    }
    // Clone the document to avoid the original document being affected
    // (which shows up when exiting reader mode again).
    let clone = doc.documentElement.cloneNode(true);
    clone.querySelector("pre").replaceWith(docFrag);
    return clone;
  },
};

// Exposes the "reader.parse-node-limit" pref as ReaderMode.maxElemsToParse
// (default 0); _readerParse treats a falsy value as "no limit".
XPCOMUtils.defineLazyPreferenceGetter(
  ReaderMode,
  "maxElemsToParse",
  "reader.parse-node-limit",
  0
);