From 0ebf5bdf043a27fd3dfb7f92e0cb63d88954c44d Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 03:47:29 +0200 Subject: Adding upstream version 115.8.0esr. Signed-off-by: Daniel Baumann --- toolkit/components/reader/.eslintrc.js | 13 + toolkit/components/reader/AboutReader.sys.mjs | 1548 +++++++++++++ toolkit/components/reader/JSDOMParser.js | 1203 ++++++++++ .../components/reader/Readability-readerable.js | 116 + toolkit/components/reader/Readability.js | 2290 ++++++++++++++++++++ toolkit/components/reader/ReaderMode.sys.mjs | 561 +++++ toolkit/components/reader/ReaderWorker.js | 61 + toolkit/components/reader/ReaderWorker.sys.mjs | 13 + toolkit/components/reader/Readerable.js | 93 + toolkit/components/reader/Readerable.sys.mjs | 9 + toolkit/components/reader/content/aboutReader.html | 182 ++ toolkit/components/reader/jar.mn | 6 + toolkit/components/reader/moz.build | 28 + toolkit/components/reader/test/browser.ini | 66 + .../test/browser_bug1124271_readerModePinnedTab.js | 53 + .../test/browser_bug1453818_samesite_cookie.js | 132 ++ .../reader/test/browser_drag_url_readerMode.js | 54 + .../reader/test/browser_localfile_readerMode.js | 54 + .../components/reader/test/browser_readerMode.js | 394 ++++ .../reader/test/browser_readerMode_bc_reuse.js | 44 + .../reader/test/browser_readerMode_cached.js | 32 + .../test/browser_readerMode_colorSchemePref.js | 67 + .../reader/test/browser_readerMode_hidden_nodes.js | 56 + .../reader/test/browser_readerMode_menu.js | 69 + .../reader/test/browser_readerMode_pocket.js | 136 ++ .../reader/test/browser_readerMode_readingTime.js | 101 + .../reader/test/browser_readerMode_refresh.js | 49 + .../reader/test/browser_readerMode_remoteType.js | 87 + .../browser_readerMode_samesite_cookie_redirect.js | 50 + .../reader/test/browser_readerMode_with_anchor.js | 89 + toolkit/components/reader/test/getCookies.sjs | 16 + toolkit/components/reader/test/head.js | 63 + .../components/reader/test/linkToGetCookies.html | 13 + .../components/reader/test/readerModeArticle.html | 28 + .../reader/test/readerModeArticleHiddenNodes.html | 22 + .../reader/test/readerModeArticleMedium.html | 16 + .../reader/test/readerModeArticleShort.html | 14 + .../reader/test/readerModeArticleTextPlain.txt | 10 + .../reader/test/readerModeNonArticle.html | 9 + .../components/reader/test/readerModeRandom.sjs | 23 + .../components/reader/test/setSameSiteCookie.html | 9 + .../reader/test/setSameSiteCookie.html^headers^ | 1 + 42 files changed, 7880 insertions(+) create mode 100644 toolkit/components/reader/.eslintrc.js create mode 100644 toolkit/components/reader/AboutReader.sys.mjs create mode 100644 toolkit/components/reader/JSDOMParser.js create mode 100644 toolkit/components/reader/Readability-readerable.js create mode 100644 toolkit/components/reader/Readability.js create mode 100644 toolkit/components/reader/ReaderMode.sys.mjs create mode 100644 toolkit/components/reader/ReaderWorker.js create mode 100644 toolkit/components/reader/ReaderWorker.sys.mjs create mode 100644 toolkit/components/reader/Readerable.js create mode 100644 toolkit/components/reader/Readerable.sys.mjs create mode 100644 toolkit/components/reader/content/aboutReader.html create mode 100644 toolkit/components/reader/jar.mn create mode 100644 toolkit/components/reader/moz.build create mode 100644 toolkit/components/reader/test/browser.ini create mode 100644 toolkit/components/reader/test/browser_bug1124271_readerModePinnedTab.js create mode 100644 toolkit/components/reader/test/browser_bug1453818_samesite_cookie.js create mode 100644 toolkit/components/reader/test/browser_drag_url_readerMode.js create mode 100644 toolkit/components/reader/test/browser_localfile_readerMode.js create mode 100644 toolkit/components/reader/test/browser_readerMode.js create mode 100644 toolkit/components/reader/test/browser_readerMode_bc_reuse.js create mode 100644 toolkit/components/reader/test/browser_readerMode_cached.js create mode 100644 toolkit/components/reader/test/browser_readerMode_colorSchemePref.js create mode 100644 toolkit/components/reader/test/browser_readerMode_hidden_nodes.js create mode 100644 toolkit/components/reader/test/browser_readerMode_menu.js create mode 100644 toolkit/components/reader/test/browser_readerMode_pocket.js create mode 100644 toolkit/components/reader/test/browser_readerMode_readingTime.js create mode 100644 toolkit/components/reader/test/browser_readerMode_refresh.js create mode 100644 toolkit/components/reader/test/browser_readerMode_remoteType.js create mode 100644 toolkit/components/reader/test/browser_readerMode_samesite_cookie_redirect.js create mode 100644 toolkit/components/reader/test/browser_readerMode_with_anchor.js create mode 100644 toolkit/components/reader/test/getCookies.sjs create mode 100644 toolkit/components/reader/test/head.js create mode 100644 toolkit/components/reader/test/linkToGetCookies.html create mode 100644 toolkit/components/reader/test/readerModeArticle.html create mode 100644 toolkit/components/reader/test/readerModeArticleHiddenNodes.html create mode 100644 toolkit/components/reader/test/readerModeArticleMedium.html create mode 100644 toolkit/components/reader/test/readerModeArticleShort.html create mode 100644 toolkit/components/reader/test/readerModeArticleTextPlain.txt create mode 100644 toolkit/components/reader/test/readerModeNonArticle.html create mode 100644 toolkit/components/reader/test/readerModeRandom.sjs create mode 100644 toolkit/components/reader/test/setSameSiteCookie.html create mode 100644 toolkit/components/reader/test/setSameSiteCookie.html^headers^ (limited to 'toolkit/components/reader') diff --git a/toolkit/components/reader/.eslintrc.js b/toolkit/components/reader/.eslintrc.js new file mode 100644 index 0000000000..4df95dd5f8 --- /dev/null +++ b/toolkit/components/reader/.eslintrc.js @@ -0,0 +1,13 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +"use strict"; + +module.exports = { + rules: { + "no-inner-declarations": "error", + "no-shadow": "error", + "no-unused-vars": ["error", { vars: "all", args: "none" }], + }, +}; diff --git a/toolkit/components/reader/AboutReader.sys.mjs b/toolkit/components/reader/AboutReader.sys.mjs new file mode 100644 index 0000000000..e9bc0bb729 --- /dev/null +++ b/toolkit/components/reader/AboutReader.sys.mjs @@ -0,0 +1,1548 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +import { ReaderMode } from "resource://gre/modules/ReaderMode.sys.mjs"; + +import { AppConstants } from "resource://gre/modules/AppConstants.sys.mjs"; +import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs"; + +const lazy = {}; + +ChromeUtils.defineESModuleGetters(lazy, { + AsyncPrefs: "resource://gre/modules/AsyncPrefs.sys.mjs", + NarrateControls: "resource://gre/modules/narrate/NarrateControls.sys.mjs", + NimbusFeatures: "resource://nimbus/ExperimentAPI.sys.mjs", +}); + +XPCOMUtils.defineLazyGetter( + lazy, + "numberFormat", + () => new Services.intl.NumberFormat(undefined) +); +XPCOMUtils.defineLazyGetter( + lazy, + "pluralRules", + () => new Services.intl.PluralRules(undefined) +); + +const COLORSCHEME_L10N_IDS = { + light: "about-reader-color-scheme-light", + dark: "about-reader-color-scheme-dark", + sepia: "about-reader-color-scheme-sepia", + auto: "about-reader-color-scheme-auto", +}; + +Services.telemetry.setEventRecordingEnabled("readermode", true); + +const zoomOnCtrl = + Services.prefs.getIntPref("mousewheel.with_control.action", 3) == 3; +const zoomOnMeta = + Services.prefs.getIntPref("mousewheel.with_meta.action", 1) == 3; +const isAppLocaleRTL = Services.locale.isAppLocaleRTL; + +export var AboutReader = function ( + actor, + articlePromise, + docContentType = "document", + docTitle = "" +) { + let win = actor.contentWindow; + let url = this._getOriginalUrl(win); + if ( + !( + url.startsWith("http://") || + url.startsWith("https://") || + url.startsWith("file://") + ) + ) { + let errorMsg = + "Only http://, https:// and file:// URLs can be loaded in about:reader."; + if (Services.prefs.getBoolPref("reader.errors.includeURLs")) { + errorMsg += " Tried to load: " + url + "."; + } + console.error(errorMsg); + win.location.href = "about:blank"; + return; + } + + let doc = win.document; + if (isAppLocaleRTL) { + doc.dir = "rtl"; + } + doc.documentElement.setAttribute("platform", AppConstants.platform); + + doc.title = docTitle; + + this._actor = actor; + this._isLoggedInPocketUser = undefined; + + this._docRef = Cu.getWeakReference(doc); + this._winRef = Cu.getWeakReference(win); + this._innerWindowId = win.windowGlobalChild.innerWindowId; + + this._article = null; + this._languagePromise = new Promise(resolve => { + this._foundLanguage = resolve; + }); + + if (articlePromise) { + this._articlePromise = articlePromise; + } + + this._headerElementRef = Cu.getWeakReference( + doc.querySelector(".reader-header") + ); + this._domainElementRef = Cu.getWeakReference( + doc.querySelector(".reader-domain") + ); + this._titleElementRef = Cu.getWeakReference( + doc.querySelector(".reader-title") + ); + this._readTimeElementRef = Cu.getWeakReference( + doc.querySelector(".reader-estimated-time") + ); + this._creditsElementRef = Cu.getWeakReference( + doc.querySelector(".reader-credits") + ); + this._contentElementRef = Cu.getWeakReference( + doc.querySelector(".moz-reader-content") + ); + this._toolbarContainerElementRef = Cu.getWeakReference( + doc.querySelector(".toolbar-container") + ); + this._toolbarElementRef = Cu.getWeakReference( + doc.querySelector(".reader-controls") + ); + this._messageElementRef = Cu.getWeakReference( + doc.querySelector(".reader-message") + ); + this._containerElementRef = Cu.getWeakReference( + doc.querySelector(".container") + ); + + doc.addEventListener("mousedown", this); + doc.addEventListener("click", this); + doc.addEventListener("touchstart", this); + + win.addEventListener("pagehide", this); + win.addEventListener("resize", this); + win.addEventListener("wheel", this, { passive: false }); + + this.colorSchemeMediaList = win.matchMedia("(prefers-color-scheme: dark)"); + this.colorSchemeMediaList.addEventListener("change", this); + + this.forcedColorsMediaList = win.matchMedia("(forced-colors)"); + this.forcedColorsMediaList.addEventListener("change", this); + + this._topScrollChange = this._topScrollChange.bind(this); + this._intersectionObs = new win.IntersectionObserver(this._topScrollChange, { + root: null, + threshold: [0, 1], + }); + this._intersectionObs.observe(doc.querySelector(".top-anchor")); + + this._ctaIntersectionObserver = new win.IntersectionObserver( + this._pocketCTAObserved.bind(this), + { + threshold: 0.5, + } + ); + + Services.obs.addObserver(this, "inner-window-destroyed"); + + this._setupButton("close-button", this._onReaderClose.bind(this)); + + // we're ready for any external setup, send a signal for that. + this._actor.sendAsyncMessage("Reader:OnSetup"); + + let colorSchemeValues = JSON.parse( + Services.prefs.getCharPref("reader.color_scheme.values") + ); + let colorSchemeOptions = colorSchemeValues.map(value => ({ + l10nId: COLORSCHEME_L10N_IDS[value], + groupName: "color-scheme", + value, + itemClass: value + "-button", + })); + let colorScheme = Services.prefs.getCharPref("reader.color_scheme"); + + this._setupSegmentedButton( + "color-scheme-buttons", + colorSchemeOptions, + colorScheme, + this._setColorSchemePref.bind(this) + ); + this._setColorSchemePref(colorScheme); + + let fontTypeOptions = [ + { + l10nId: "about-reader-font-type-sans-serif", + groupName: "font-type", + value: "sans-serif", + itemClass: "sans-serif-button", + }, + { + l10nId: "about-reader-font-type-serif", + groupName: "font-type", + value: "serif", + itemClass: "serif-button", + }, + ]; + + let fontType = Services.prefs.getCharPref("reader.font_type"); + this._setupSegmentedButton( + "font-type-buttons", + fontTypeOptions, + fontType, + this._setFontType.bind(this) + ); + this._setFontType(fontType); + + this._setupFontSizeButtons(); + + this._setupContentWidthButtons(); + + this._setupLineHeightButtons(); + + if (win.speechSynthesis && Services.prefs.getBoolPref("narrate.enabled")) { + new lazy.NarrateControls(win, this._languagePromise); + } + + this._loadArticle(docContentType); +}; + +AboutReader.prototype = { + _BLOCK_IMAGES_SELECTOR: + ".content p > img:only-child, " + + ".content p > a:only-child > img:only-child, " + + ".content .wp-caption img, " + + ".content figure img", + + _TABLES_SELECTOR: ".content table", + + FONT_SIZE_MIN: 1, + + FONT_SIZE_LEGACY_MAX: 9, + + FONT_SIZE_MAX: 15, + + FONT_SIZE_EXTENDED_VALUES: [32, 40, 56, 72, 96, 128], + + get _doc() { + return this._docRef.get(); + }, + + get _win() { + return this._winRef.get(); + }, + + get _headerElement() { + return this._headerElementRef.get(); + }, + + get _domainElement() { + return this._domainElementRef.get(); + }, + + get _titleElement() { + return this._titleElementRef.get(); + }, + + get _readTimeElement() { + return this._readTimeElementRef.get(); + }, + + get _creditsElement() { + return this._creditsElementRef.get(); + }, + + get _contentElement() { + return this._contentElementRef.get(); + }, + + get _toolbarElement() { + return this._toolbarElementRef.get(); + }, + + get _toolbarContainerElement() { + return this._toolbarContainerElementRef.get(); + }, + + get _messageElement() { + return this._messageElementRef.get(); + }, + + get _containerElement() { + return this._containerElementRef.get(); + }, + + get _isToolbarVertical() { + if (this._toolbarVertical !== undefined) { + return this._toolbarVertical; + } + return (this._toolbarVertical = Services.prefs.getBoolPref( + "reader.toolbar.vertical" + )); + }, + + receiveMessage({ data, name }) { + const doc = this._doc; + switch (name) { + case "Reader:AddButton": { + if (data.id && data.image && !doc.getElementsByClassName(data.id)[0]) { + let btn = doc.createElement("button"); + btn.dataset.buttonid = data.id; + btn.dataset.telemetryId = `reader-${data.telemetryId}`; + btn.className = "toolbar-button " + data.id; + btn.setAttribute("aria-labelledby", "label-" + data.id); + let tip = doc.createElement("span"); + tip.className = "hover-label"; + tip.id = "label-" + data.id; + doc.l10n.setAttributes(tip, data.l10nId); + btn.append(tip); + btn.style.backgroundImage = "url('" + data.image + "')"; + if (data.width && data.height) { + btn.style.backgroundSize = `${data.width}px ${data.height}px`; + } + let tb = this._toolbarElement; + tb.appendChild(btn); + this._setupButton(data.id, button => { + this._actor.sendAsyncMessage( + "Reader:Clicked-" + button.dataset.buttonid, + { article: this._article } + ); + }); + } + break; + } + case "Reader:RemoveButton": { + if (data.id) { + let btn = doc.getElementsByClassName(data.id)[0]; + if (btn) { + btn.remove(); + } + } + break; + } + case "Reader:ZoomIn": { + this._changeFontSize(+1); + break; + } + case "Reader:ZoomOut": { + this._changeFontSize(-1); + break; + } + case "Reader:ResetZoom": { + this._resetFontSize(); + break; + } + } + }, + + handleEvent(aEvent) { + if (!aEvent.isTrusted) { + return; + } + + let target = aEvent.target; + switch (aEvent.type) { + case "touchstart": + /* fall through */ + case "mousedown": + if ( + !target.closest(".dropdown-popup") && + // Skip handling the toggle button here becase + // the dropdown will get toggled with the 'click' event. + !target.classList.contains("dropdown-toggle") + ) { + this._closeDropdowns(); + } + break; + case "click": + const buttonLabel = + target.attributes.getNamedItem(`data-telemetry-id`)?.value; + + if (buttonLabel) { + Services.telemetry.recordEvent( + "readermode", + "button", + "click", + null, + { + label: buttonLabel, + } + ); + } + + if (target.classList.contains("dropdown-toggle")) { + this._toggleDropdownClicked(aEvent); + } + break; + case "scroll": + let lastHeight = this._lastHeight; + let { windowUtils } = this._win; + this._lastHeight = windowUtils.getBoundsWithoutFlushing( + this._doc.body + ).height; + // Only close dropdowns if the scroll events are not a result of line + // height / font-size changes that caused a page height change. + if (lastHeight == this._lastHeight) { + this._closeDropdowns(true); + } + + break; + case "resize": + this._updateImageMargins(); + this._scheduleToolbarOverlapHandler(); + break; + + case "wheel": + let doZoom = + (aEvent.ctrlKey && zoomOnCtrl) || (aEvent.metaKey && zoomOnMeta); + if (!doZoom) { + return; + } + aEvent.preventDefault(); + + // Throttle events to once per 150ms. This avoids excessively fast zooming. + if (aEvent.timeStamp <= this._zoomBackoffTime) { + return; + } + this._zoomBackoffTime = aEvent.timeStamp + 150; + + // Determine the direction of the delta (we don't care about its size); + // This code is adapted from normalizeWheelEventDelta in + // toolkt/components/pdfjs/content/web/viewer.js + let delta = Math.abs(aEvent.deltaX) + Math.abs(aEvent.deltaY); + let angle = Math.atan2(aEvent.deltaY, aEvent.deltaX); + if (-0.25 * Math.PI < angle && angle < 0.75 * Math.PI) { + delta = -delta; + } + + if (delta > 0) { + this._changeFontSize(+1); + } else if (delta < 0) { + this._changeFontSize(-1); + } + break; + + case "pagehide": + this._closeDropdowns(); + + this._actor.readerModeHidden(); + this.clearActor(); + + // Disconnect and delete IntersectionObservers to prevent memory leaks: + + this._intersectionObs.unobserve(this._doc.querySelector(".top-anchor")); + this._ctaIntersectionObserver.disconnect(); + + delete this._intersectionObs; + delete this._ctaIntersectionObserver; + + break; + + case "change": + let colorScheme; + if (this.forcedColorsMediaList.matches) { + colorScheme = "hcm"; + } else { + colorScheme = Services.prefs.getCharPref("reader.color_scheme"); + // We should be changing the color scheme in relation to a preference change + // if the user has the color scheme preference set to "Auto". + if (colorScheme == "auto") { + colorScheme = this.colorSchemeMediaList.matches ? "dark" : "light"; + } + } + this._setColorScheme(colorScheme); + + break; + } + }, + + clearActor() { + this._actor = null; + }, + + _onReaderClose() { + if (this._actor) { + this._actor.closeReaderMode(); + } + }, + + async _resetFontSize() { + await lazy.AsyncPrefs.reset("reader.font_size"); + let currentSize = Services.prefs.getIntPref("reader.font_size"); + this._setFontSize(currentSize); + }, + + _setFontSize(newFontSize) { + this._fontSize = Math.min( + this.FONT_SIZE_MAX, + Math.max(this.FONT_SIZE_MIN, newFontSize) + ); + let size; + if (this._fontSize > this.FONT_SIZE_LEGACY_MAX) { + // -1 because we're indexing into a 0-indexed array, so the first value + // over the legacy max should be 0, the next 1, etc. + let index = this._fontSize - this.FONT_SIZE_LEGACY_MAX - 1; + size = this.FONT_SIZE_EXTENDED_VALUES[index]; + } else { + size = 10 + 2 * this._fontSize; + } + + let readerBody = this._doc.body; + readerBody.style.setProperty("--font-size", size + "px"); + return lazy.AsyncPrefs.set("reader.font_size", this._fontSize); + }, + + _setupFontSizeButtons() { + let plusButton = this._doc.querySelector(".plus-button"); + let minusButton = this._doc.querySelector(".minus-button"); + + let currentSize = Services.prefs.getIntPref("reader.font_size"); + this._setFontSize(currentSize); + this._updateFontSizeButtonControls(); + + plusButton.addEventListener( + "click", + event => { + if (!event.isTrusted) { + return; + } + event.stopPropagation(); + this._changeFontSize(+1); + }, + true + ); + + minusButton.addEventListener( + "click", + event => { + if (!event.isTrusted) { + return; + } + event.stopPropagation(); + this._changeFontSize(-1); + }, + true + ); + }, + + _updateFontSizeButtonControls() { + let plusButton = this._doc.querySelector(".plus-button"); + let minusButton = this._doc.querySelector(".minus-button"); + + let currentSize = this._fontSize; + let fontValue = this._doc.querySelector(".font-size-value"); + fontValue.textContent = currentSize; + + if (currentSize === this.FONT_SIZE_MIN) { + minusButton.setAttribute("disabled", true); + } else { + minusButton.removeAttribute("disabled"); + } + if (currentSize === this.FONT_SIZE_MAX) { + plusButton.setAttribute("disabled", true); + } else { + plusButton.removeAttribute("disabled"); + } + }, + + _changeFontSize(changeAmount) { + let currentSize = + Services.prefs.getIntPref("reader.font_size") + changeAmount; + this._setFontSize(currentSize); + this._updateFontSizeButtonControls(); + this._scheduleToolbarOverlapHandler(); + }, + + _setContentWidth(newContentWidth) { + this._contentWidth = newContentWidth; + this._displayContentWidth(newContentWidth); + let width = 20 + 5 * (this._contentWidth - 1) + "em"; + this._doc.body.style.setProperty("--content-width", width); + this._scheduleToolbarOverlapHandler(); + return lazy.AsyncPrefs.set("reader.content_width", this._contentWidth); + }, + + _displayContentWidth(currentContentWidth) { + let contentWidthValue = this._doc.querySelector(".content-width-value"); + contentWidthValue.textContent = currentContentWidth; + }, + + _setupContentWidthButtons() { + const CONTENT_WIDTH_MIN = 1; + const CONTENT_WIDTH_MAX = 9; + + let currentContentWidth = Services.prefs.getIntPref("reader.content_width"); + currentContentWidth = Math.max( + CONTENT_WIDTH_MIN, + Math.min(CONTENT_WIDTH_MAX, currentContentWidth) + ); + + this._displayContentWidth(currentContentWidth); + + let plusButton = this._doc.querySelector(".content-width-plus-button"); + let minusButton = this._doc.querySelector(".content-width-minus-button"); + + function updateControls() { + if (currentContentWidth === CONTENT_WIDTH_MIN) { + minusButton.setAttribute("disabled", true); + } else { + minusButton.removeAttribute("disabled"); + } + if (currentContentWidth === CONTENT_WIDTH_MAX) { + plusButton.setAttribute("disabled", true); + } else { + plusButton.removeAttribute("disabled"); + } + } + + updateControls(); + this._setContentWidth(currentContentWidth); + + plusButton.addEventListener( + "click", + event => { + if (!event.isTrusted) { + return; + } + event.stopPropagation(); + + if (currentContentWidth >= CONTENT_WIDTH_MAX) { + return; + } + + currentContentWidth++; + updateControls(); + this._setContentWidth(currentContentWidth); + }, + true + ); + + minusButton.addEventListener( + "click", + event => { + if (!event.isTrusted) { + return; + } + event.stopPropagation(); + + if (currentContentWidth <= CONTENT_WIDTH_MIN) { + return; + } + + currentContentWidth--; + updateControls(); + this._setContentWidth(currentContentWidth); + }, + true + ); + }, + + _setLineHeight(newLineHeight) { + this._displayLineHeight(newLineHeight); + let height = 1 + 0.2 * (newLineHeight - 1) + "em"; + this._containerElement.style.setProperty("--line-height", height); + return lazy.AsyncPrefs.set("reader.line_height", newLineHeight); + }, + + _displayLineHeight(currentLineHeight) { + let lineHeightValue = this._doc.querySelector(".line-height-value"); + lineHeightValue.textContent = currentLineHeight; + }, + + _setupLineHeightButtons() { + const LINE_HEIGHT_MIN = 1; + const LINE_HEIGHT_MAX = 9; + + let currentLineHeight = Services.prefs.getIntPref("reader.line_height"); + currentLineHeight = Math.max( + LINE_HEIGHT_MIN, + Math.min(LINE_HEIGHT_MAX, currentLineHeight) + ); + + this._displayLineHeight(currentLineHeight); + + let plusButton = this._doc.querySelector(".line-height-plus-button"); + let minusButton = this._doc.querySelector(".line-height-minus-button"); + + function updateControls() { + if (currentLineHeight === LINE_HEIGHT_MIN) { + minusButton.setAttribute("disabled", true); + } else { + minusButton.removeAttribute("disabled"); + } + if (currentLineHeight === LINE_HEIGHT_MAX) { + plusButton.setAttribute("disabled", true); + } else { + plusButton.removeAttribute("disabled"); + } + } + + updateControls(); + this._setLineHeight(currentLineHeight); + + plusButton.addEventListener( + "click", + event => { + if (!event.isTrusted) { + return; + } + event.stopPropagation(); + + if (currentLineHeight >= LINE_HEIGHT_MAX) { + return; + } + + currentLineHeight++; + updateControls(); + this._setLineHeight(currentLineHeight); + }, + true + ); + + minusButton.addEventListener( + "click", + event => { + if (!event.isTrusted) { + return; + } + event.stopPropagation(); + + if (currentLineHeight <= LINE_HEIGHT_MIN) { + return; + } + + currentLineHeight--; + updateControls(); + this._setLineHeight(currentLineHeight); + }, + true + ); + }, + + _setColorScheme(newColorScheme) { + // There's nothing to change if the new color scheme is the same as our current scheme. + if (this._colorScheme === newColorScheme) { + return; + } + + let bodyClasses = this._doc.body.classList; + + if (this._colorScheme) { + bodyClasses.remove(this._colorScheme); + } + + if (!this._win.matchMedia("(forced-colors)").matches) { + if (newColorScheme === "auto") { + this._colorScheme = this.colorSchemeMediaList.matches + ? "dark" + : "light"; + } else { + this._colorScheme = newColorScheme; + } + } else { + this._colorScheme = "hcm"; + } + + bodyClasses.add(this._colorScheme); + }, + + // Pref values include "dark", "light", "sepia", and "auto" + _setColorSchemePref(colorSchemePref) { + this._setColorScheme(colorSchemePref); + + lazy.AsyncPrefs.set("reader.color_scheme", colorSchemePref); + }, + + _setFontType(newFontType) { + if (this._fontType === newFontType) { + return; + } + + let bodyClasses = this._doc.body.classList; + + if (this._fontType) { + bodyClasses.remove(this._fontType); + } + + this._fontType = newFontType; + bodyClasses.add(this._fontType); + + lazy.AsyncPrefs.set("reader.font_type", this._fontType); + }, + + async _loadArticle(docContentType = "document") { + let url = this._getOriginalUrl(); + this._showProgressDelayed(); + + let article; + if (this._articlePromise) { + article = await this._articlePromise; + } + + if (!article) { + try { + article = await ReaderMode.downloadAndParseDocument( + url, + docContentType + ); + } catch (e) { + if (e?.newURL && this._actor) { + await this._actor.sendQuery("RedirectTo", { + newURL: e.newURL, + article: e.article, + }); + + let readerURL = "about:reader?url=" + encodeURIComponent(e.newURL); + this._win.location.replace(readerURL); + return; + } + } + } + + if (!this._actor) { + return; + } + + // Replace the loading message with an error message if there's a failure. + // Users are supposed to navigate away by themselves (because we cannot + // remove ourselves from session history.) + if (!article) { + this._showError(); + return; + } + + this._showContent(article); + }, + + async _requestPocketLoginStatus() { + let isLoggedIn = await this._actor.sendQuery( + "Reader:PocketLoginStatusRequest" + ); + + return isLoggedIn; + }, + + async _requestPocketArticleInfo(url) { + let articleInfo = await this._actor.sendQuery( + "Reader:PocketGetArticleInfo", + { + url, + } + ); + + return articleInfo?.item_preview?.item_id; + }, + + async _requestPocketArticleRecs(itemID) { + let recs = await this._actor.sendQuery("Reader:PocketGetArticleRecs", { + itemID, + }); + + return recs; + }, + + async _savePocketArticle(url) { + let result = await this._actor.sendQuery("Reader:PocketSaveArticle", { + url, + }); + + return result; + }, + + async _requestFavicon() { + let iconDetails = await this._actor.sendQuery("Reader:FaviconRequest", { + url: this._article.url, + preferredWidth: 16 * this._win.devicePixelRatio, + }); + + if (iconDetails) { + this._loadFavicon(iconDetails.url, iconDetails.faviconUrl); + } + }, + + _loadFavicon(url, faviconUrl) { + if (this._article.url !== url) { + return; + } + + let doc = this._doc; + + let link = doc.createElement("link"); + link.rel = "shortcut icon"; + link.href = faviconUrl; + + doc.getElementsByTagName("head")[0].appendChild(link); + }, + + _updateImageMargins() { + let windowWidth = this._win.innerWidth; + let bodyWidth = this._doc.body.clientWidth; + + let setImageMargins = function (img) { + img.classList.add("moz-reader-block-img"); + + // If the image is at least as wide as the window, make it fill edge-to-edge on mobile. + if (img.naturalWidth >= windowWidth) { + img.setAttribute("moz-reader-full-width", true); + } else { + img.removeAttribute("moz-reader-full-width"); + } + + // If the image is at least half as wide as the body, center it on desktop. + if (img.naturalWidth >= bodyWidth / 2) { + img.setAttribute("moz-reader-center", true); + } else { + img.removeAttribute("moz-reader-center"); + } + }; + + let imgs = this._doc.querySelectorAll(this._BLOCK_IMAGES_SELECTOR); + for (let i = imgs.length; --i >= 0; ) { + let img = imgs[i]; + + if (img.naturalWidth > 0) { + setImageMargins(img); + } else { + img.onload = function () { + setImageMargins(img); + }; + } + } + }, + + _updateWideTables() { + let windowWidth = this._win.innerWidth; + + // Avoid horizontal overflow in the document by making tables that are wider than half browser window's size + // by making it scrollable. + let tables = this._doc.querySelectorAll(this._TABLES_SELECTOR); + for (let i = tables.length; --i >= 0; ) { + let table = tables[i]; + let rect = table.getBoundingClientRect(); + let tableWidth = rect.width; + + if (windowWidth / 2 <= tableWidth) { + table.classList.add("moz-reader-wide-table"); + } + } + }, + + _maybeSetTextDirection: function Read_maybeSetTextDirection(article) { + // Set the article's "dir" on the contents. + // If no direction is specified, the contents should automatically be LTR + // regardless of the UI direction to avoid inheriting the parent's direction + // if the UI is RTL. + this._containerElement.dir = article.dir || "ltr"; + + // The native locale could be set differently than the article's text direction. + this._readTimeElement.dir = isAppLocaleRTL ? "rtl" : "ltr"; + + // This is used to mirror the line height buttons in the toolbar, when relevant. + this._toolbarElement.setAttribute("articledir", article.dir || "ltr"); + }, + + _showError() { + this._headerElement.classList.remove("reader-show-element"); + this._contentElement.classList.remove("reader-show-element"); + + this._doc.l10n.setAttributes( + this._messageElement, + "about-reader-load-error" + ); + this._doc.l10n.setAttributes( + this._doc.getElementById("reader-title"), + "about-reader-load-error" + ); + this._messageElement.style.display = "block"; + + this._doc.documentElement.dataset.isError = true; + + this._error = true; + + this._doc.dispatchEvent( + new this._win.CustomEvent("AboutReaderContentError", { + bubbles: true, + cancelable: false, + }) + ); + }, + + // This function is the JS version of Java's StringUtils.stripCommonSubdomains. + _stripHost(host) { + if (!host) { + return host; + } + + let start = 0; + + if (host.startsWith("www.")) { + start = 4; + } else if (host.startsWith("m.")) { + start = 2; + } else if (host.startsWith("mobile.")) { + start = 7; + } + + return host.substring(start); + }, + + _showContent(article) { + this._messageElement.classList.remove("reader-show-element"); + + this._article = article; + + this._domainElement.href = article.url; + let articleUri = Services.io.newURI(article.url); + + try { + this._domainElement.textContent = this._stripHost(articleUri.host); + } catch (ex) { + let url = this._actor.document.URL; + url = url.substring(url.indexOf("%2F") + 6); + url = url.substring(0, url.indexOf("%2F")); + + this._domainElement.textContent = url; + } + + this._creditsElement.textContent = article.byline; + + this._titleElement.textContent = article.title; + + // TODO: Once formatRange() and selectRange() are available outside Nightly, + // use them here. https://bugzilla.mozilla.org/show_bug.cgi?id=1795317 + const slow = article.readingTimeMinsSlow; + const fast = article.readingTimeMinsFast; + const fastStr = lazy.numberFormat.format(fast); + const slowStr = lazy.numberFormat.format(slow); + this._doc.l10n.setAttributes( + this._readTimeElement, + "about-reader-estimated-read-time", + { + range: fastStr === slowStr ? `~${fastStr}` : `${fastStr}–${slowStr}`, + rangePlural: lazy.pluralRules.select(slow), + } + ); + + // If a document title was not provided in the constructor, we'll fall back + // to using the article title. + if (!this._doc.title) { + this._doc.title = article.title; + } + + this._containerElement.setAttribute("lang", article.lang); + + this._headerElement.classList.add("reader-show-element"); + + let parserUtils = Cc["@mozilla.org/parserutils;1"].getService( + Ci.nsIParserUtils + ); + let contentFragment = parserUtils.parseFragment( + article.content, + Ci.nsIParserUtils.SanitizerDropForms | + Ci.nsIParserUtils.SanitizerAllowStyle, + false, + articleUri, + this._contentElement + ); + this._contentElement.innerHTML = ""; + this._contentElement.appendChild(contentFragment); + this._maybeSetTextDirection(article); + this._foundLanguage(article.language); + + this._contentElement.classList.add("reader-show-element"); + this._updateImageMargins(); + this._updateWideTables(); + + this._requestFavicon(); + this._doc.body.classList.add("loaded"); + + this._goToReference(articleUri.ref); + + Services.obs.notifyObservers(this._win, "AboutReader:Ready"); + + this._doc.dispatchEvent( + new this._win.CustomEvent("AboutReaderContentReady", { + bubbles: true, + cancelable: false, + }) + ); + + // Show Pocket CTA block after article has loaded to prevent it flashing in prematurely + this._setupPocketCTA(); + }, + + _hideContent() { + this._headerElement.classList.remove("reader-show-element"); + this._contentElement.classList.remove("reader-show-element"); + }, + + _showProgressDelayed() { + this._win.setTimeout(() => { + // No need to show progress if the article has been loaded, + // if the window has been unloaded, or if there was an error + // trying to load the article. + if (this._article || !this._actor || this._error) { + return; + } + + this._headerElement.classList.remove("reader-show-element"); + this._contentElement.classList.remove("reader-show-element"); + + this._doc.l10n.setAttributes( + this._messageElement, + "about-reader-loading" + ); + this._messageElement.classList.add("reader-show-element"); + }, 300); + }, + + /** + * Returns the original article URL for this about:reader view. + */ + _getOriginalUrl(win) { + let url = win ? win.location.href : this._win.location.href; + return ReaderMode.getOriginalUrl(url) || url; + }, + + _setupSegmentedButton(id, options, initialValue, callback) { + let doc = this._doc; + let segmentedButton = doc.getElementsByClassName(id)[0]; + + for (let option of options) { + let radioButton = doc.createElement("input"); + radioButton.id = "radio-item" + option.itemClass; + radioButton.type = "radio"; + radioButton.classList.add("radio-button"); + radioButton.name = option.groupName; + segmentedButton.appendChild(radioButton); + + let item = doc.createElement("label"); + item.htmlFor = radioButton.id; + item.classList.add(option.itemClass); + doc.l10n.setAttributes(item, option.l10nId); + + segmentedButton.appendChild(item); + + radioButton.addEventListener( + "input", + function (aEvent) { + if (!aEvent.isTrusted) { + return; + } + + let labels = segmentedButton.children; + for (let label of labels) { + label.removeAttribute("checked"); + } + + aEvent.target.nextElementSibling.setAttribute("checked", "true"); + callback(option.value); + }, + true + ); + + if (option.value === initialValue) { + radioButton.checked = true; + item.setAttribute("checked", "true"); + } + } + }, + + _setupButton(id, callback) { + let button = this._doc.querySelector("." + id); + button.removeAttribute("hidden"); + button.addEventListener( + "click", + function (aEvent) { + if (!aEvent.isTrusted) { + return; + } + + let btn = aEvent.target; + callback(btn); + }, + true + ); + }, + + _toggleDropdownClicked(event) { + let dropdown = event.target.closest(".dropdown"); + + if (!dropdown) { + return; + } + + event.stopPropagation(); + + if (dropdown.classList.contains("open")) { + this._closeDropdowns(); + } else { + this._openDropdown(dropdown); + } + }, + + /* + * If the ReaderView banner font-dropdown is closed, open it. + */ + _openDropdown(dropdown, window) { + if (dropdown.classList.contains("open")) { + return; + } + + this._closeDropdowns(); + + // Get the height of the doc and start handling scrolling: + let { windowUtils } = this._win; + this._lastHeight = windowUtils.getBoundsWithoutFlushing( + this._doc.body + ).height; + this._doc.addEventListener("scroll", this); + + dropdown.classList.add("open"); + this._toolbarElement.classList.add("dropdown-open"); + + this._toolbarContainerElement.classList.add("dropdown-open"); + this._toggleToolbarFixedPosition(true); + }, + + /* + * If the ReaderView has open dropdowns, close them. If we are closing the + * dropdowns because the page is scrolling, allow popups to stay open with + * the keep-open class. + */ + _closeDropdowns(scrolling) { + let selector = ".dropdown.open"; + if (scrolling) { + selector += ":not(.keep-open)"; + } + + let openDropdowns = this._doc.querySelectorAll(selector); + let haveOpenDropdowns = openDropdowns.length; + for (let dropdown of openDropdowns) { + dropdown.classList.remove("open"); + } + this._toolbarElement.classList.remove("dropdown-open"); + + if (haveOpenDropdowns) { + this._toolbarContainerElement.classList.remove("dropdown-open"); + this._toggleToolbarFixedPosition(false); + } + + // Stop handling scrolling: + this._doc.removeEventListener("scroll", this); + }, + + _toggleToolbarFixedPosition(shouldBeFixed) { + let el = this._toolbarContainerElement; + let fontSize = this._doc.body.style.getPropertyValue("--font-size"); + let contentWidth = this._doc.body.style.getPropertyValue("--content-width"); + if (shouldBeFixed) { + el.style.setProperty("--font-size", fontSize); + el.style.setProperty("--content-width", contentWidth); + el.classList.add("transition-location"); + } else { + let expectTransition = + el.style.getPropertyValue("--font-size") != fontSize || + el.style.getPropertyValue("--content-width") != contentWidth; + if (expectTransition) { + el.addEventListener( + "transitionend", + () => el.classList.remove("transition-location"), + { once: true } + ); + } else { + el.classList.remove("transition-location"); + } + el.style.removeProperty("--font-size"); + el.style.removeProperty("--content-width"); + el.classList.remove("overlaps"); + } + }, + + _scheduleToolbarOverlapHandler() { + if (this._enqueuedToolbarOverlapHandler) { + return; + } + this._enqueuedToolbarOverlapHandler = this._win.requestAnimationFrame( + () => { + this._win.setTimeout(() => this._toolbarOverlapHandler(), 0); + } + ); + }, + + _toolbarOverlapHandler() { + delete this._enqueuedToolbarOverlapHandler; + // Ensure the dropdown is still open to avoid racing with that changing. + if (this._toolbarContainerElement.classList.contains("dropdown-open")) { + let { windowUtils } = this._win; + let toolbarBounds = windowUtils.getBoundsWithoutFlushing( + this._toolbarElement.parentNode + ); + let textBounds = windowUtils.getBoundsWithoutFlushing( + this._containerElement + ); + let overlaps = false; + if (isAppLocaleRTL) { + overlaps = textBounds.right > toolbarBounds.left; + } else { + overlaps = textBounds.left < toolbarBounds.right; + } + this._toolbarContainerElement.classList.toggle("overlaps", overlaps); + } + }, + + _topScrollChange(entries) { + if (!entries.length) { + return; + } + // If we don't intersect the item at the top of the document, we're + // scrolled down: + let scrolled = !entries[entries.length - 1].isIntersecting; + let tbc = this._toolbarContainerElement; + tbc.classList.toggle("scrolled", scrolled); + }, + + /* + * Scroll reader view to a reference + */ + _goToReference(ref) { + if (ref) { + if (this._doc.readyState == "complete") { + this._win.location.hash = ref; + } else { + this._win.addEventListener( + "load", + () => { + this._win.location.hash = ref; + }, + { once: true } + ); + } + } + }, + + _enableDismissCTA() { + let elDismissCta = this._doc.querySelector(`.pocket-dismiss-cta`); + + elDismissCta?.addEventListener(`click`, e => { + this._doc.querySelector("#pocket-cta-container").hidden = true; + + Services.telemetry.recordEvent( + "readermode", + "pocket_cta", + "close_cta", + null, + {} + ); + }); + }, + + _enableRecShowHide() { + let elPocketRecs = this._doc.querySelector(`.pocket-recs`); + let elCollapseRecs = this._doc.querySelector(`.pocket-collapse-recs`); + let elSignUp = this._doc.querySelector(`div.pocket-sign-up-wrapper`); + + let toggleRecsVisibility = () => { + let isClosed = elPocketRecs.classList.contains(`closed`); + + isClosed = !isClosed; // Toggle + + if (isClosed) { + elPocketRecs.classList.add(`closed`); + elCollapseRecs.classList.add(`closed`); + elSignUp.setAttribute(`hidden`, true); + + Services.telemetry.recordEvent( + "readermode", + "pocket_cta", + "minimize_recs_click", + null, + {} + ); + } else { + elPocketRecs.classList.remove(`closed`); + elCollapseRecs.classList.remove(`closed`); + elSignUp.removeAttribute(`hidden`); + } + }; + + elCollapseRecs?.addEventListener(`click`, e => { + toggleRecsVisibility(); + }); + }, + + _buildPocketRec(title, url, publisher, thumb, time) { + let fragment = this._doc.createDocumentFragment(); + + let elContainer = this._doc.createElement(`div`); + let elTitle = this._doc.createElement(`header`); + let elMetadata = this._doc.createElement(`p`); + let elThumb = this._doc.createElement(`img`); + let elSideWrap = this._doc.createElement(`div`); + let elTop = this._doc.createElement(`a`); + let elBottom = this._doc.createElement(`div`); + let elAdd = this._doc.createElement(`button`); + + elAdd.classList.add(`pocket-btn-add`); + elBottom.classList.add(`pocket-rec-bottom`); + elTop.classList.add(`pocket-rec-top`); + elSideWrap.classList.add(`pocket-rec-side`); + elContainer.classList.add(`pocket-rec`); + elTitle.classList.add(`pocket-rec-title`); + elMetadata.classList.add(`pocket-rec-meta`); + + elTop.setAttribute(`href`, url); + + elTop.addEventListener(`click`, e => { + Services.telemetry.recordEvent( + "readermode", + "pocket_cta", + "rec_click", + null, + {} + ); + }); + + elThumb.classList.add(`pocket-rec-thumb`); + elThumb.setAttribute(`loading`, `lazy`); + elThumb.addEventListener(`load`, () => { + elThumb.classList.add(`pocket-rec-thumb-loaded`); + }); + elThumb.setAttribute( + `src`, + `https://img-getpocket.cdn.mozilla.net/132x132/filters:format(jpeg):quality(60):no_upscale():strip_exif()/${thumb}` + ); + + elAdd.textContent = `Save`; + elTitle.textContent = title; + + if (publisher && time) { + elMetadata.textContent = `${publisher} · ${time} min`; + } else if (publisher) { + elMetadata.textContent = `${publisher}`; + } else if (time) { + elMetadata.textContent = `${time} min`; + } + + elSideWrap.appendChild(elTitle); + elSideWrap.appendChild(elMetadata); + elTop.appendChild(elSideWrap); + elTop.appendChild(elThumb); + elBottom.appendChild(elAdd); + elContainer.appendChild(elTop); + elContainer.appendChild(elBottom); + fragment.appendChild(elContainer); + + elAdd.addEventListener(`click`, e => { + this._savePocketArticle(url); + elAdd.textContent = `Saved`; + elAdd.classList.add(`saved`); + + Services.telemetry.recordEvent( + "readermode", + "pocket_cta", + "rec_saved", + null, + {} + ); + }); + + return fragment; + }, + + async _getAndBuildPocketRecs() { + let elTarget = this._doc.querySelector(`.pocket-recs`); + let url = this._getOriginalUrl(); + let itemID = await this._requestPocketArticleInfo(url); + let articleRecs = await this._requestPocketArticleRecs(itemID); + + articleRecs.recommendations.forEach(rec => { + // Parse a domain from the article URL in case the Publisher name isn't available + let parsedDomain = new URL(rec.item?.normal_url)?.hostname; + + // Calculate read time from word count in case it's not available + let calculatedReadTime = Math.ceil(rec.item?.word_count / 220); + + let elRec = this._buildPocketRec( + rec.item?.title, + rec.item?.normal_url, + rec.item?.domain_metadata?.name || parsedDomain, + rec.item?.top_image_url, + rec.item?.time_to_read || calculatedReadTime + ); + + elTarget.appendChild(elRec); + }); + }, + + _pocketCTAObserved(entries) { + if (entries && entries[0]?.isIntersecting) { + this._ctaIntersectionObserver.disconnect(); + + Services.telemetry.recordEvent( + "readermode", + "pocket_cta", + "cta_seen", + null, + { + logged_in: `${this._isLoggedInPocketUser}`, + } + ); + } + }, + + async _setupPocketCTA() { + let ctaVersion = + lazy.NimbusFeatures.readerMode.getAllVariables()?.pocketCTAVersion; + this._isLoggedInPocketUser = await this._requestPocketLoginStatus(); + let elPocketCTAWrapper = this._doc.querySelector("#pocket-cta-container"); + + // Show the Pocket CTA container if the pref is set and valid + if (ctaVersion === `cta-and-recs` || ctaVersion === `cta-only`) { + if (ctaVersion === `cta-and-recs` && this._isLoggedInPocketUser) { + this._getAndBuildPocketRecs(); + this._enableRecShowHide(); + } else if (ctaVersion === `cta-and-recs` && !this._isLoggedInPocketUser) { + // Fall back to cta only for logged out users: + ctaVersion = `cta-only`; + } + + if (ctaVersion == `cta-only`) { + this._enableDismissCTA(); + } + + elPocketCTAWrapper.hidden = false; + elPocketCTAWrapper.classList.add(`pocket-cta-container-${ctaVersion}`); + elPocketCTAWrapper.classList.add( + `pocket-cta-container-${ + this._isLoggedInPocketUser ? `logged-in` : `logged-out` + }` + ); + + // Set up tracking for sign up buttons + this._doc + .querySelectorAll(`.pocket-sign-up, .pocket-discover-more`) + .forEach(el => { + el.addEventListener(`click`, e => { + Services.telemetry.recordEvent( + "readermode", + "pocket_cta", + "sign_up_click", + null, + {} + ); + }); + }); + + // Set up tracking for user seeing CTA + this._ctaIntersectionObserver.observe( + this._doc.querySelector(`#pocket-cta-container`) + ); + } + }, +}; diff --git a/toolkit/components/reader/JSDOMParser.js b/toolkit/components/reader/JSDOMParser.js new file mode 100644 index 0000000000..a01e71e5c7 --- /dev/null +++ b/toolkit/components/reader/JSDOMParser.js @@ -0,0 +1,1203 @@ +/*eslint-env es6:false*/ +/* + * DO NOT MODIFY THIS FILE DIRECTLY! + * + * This is a shared library that is maintained in an external repo: + * https://github.com/mozilla/readability + */ + +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/** + * This is a relatively lightweight DOMParser that is safe to use in a web + * worker. This is far from a complete DOM implementation; however, it should + * contain the minimal set of functionality necessary for Readability.js. + * + * Aside from not implementing the full DOM API, there are other quirks to be + * aware of when using the JSDOMParser: + * + * 1) Properly formed HTML/XML must be used. This means you should be extra + * careful when using this parser on anything received directly from an + * XMLHttpRequest. Providing a serialized string from an XMLSerializer, + * however, should be safe (since the browser's XMLSerializer should + * generate valid HTML/XML). Therefore, if parsing a document from an XHR, + * the recommended approach is to do the XHR in the main thread, use + * XMLSerializer.serializeToString() on the responseXML, and pass the + * resulting string to the worker. + * + * 2) Live NodeLists are not supported. DOM methods and properties such as + * getElementsByTagName() and childNodes return standard arrays. If you + * want these lists to be updated when nodes are removed or added to the + * document, you must take care to manually update them yourself. + */ +(function (global) { + + // XML only defines these and the numeric ones: + + var entityTable = { + "lt": "<", + "gt": ">", + "amp": "&", + "quot": '"', + "apos": "'", + }; + + var reverseEntityTable = { + "<": "<", + ">": ">", + "&": "&", + '"': """, + "'": "'", + }; + + function encodeTextContentHTML(s) { + return s.replace(/[&<>]/g, function(x) { + return reverseEntityTable[x]; + }); + } + + function encodeHTML(s) { + return s.replace(/[&<>'"]/g, function(x) { + return reverseEntityTable[x]; + }); + } + + function decodeHTML(str) { + return str.replace(/&(quot|amp|apos|lt|gt);/g, function(match, tag) { + return entityTable[tag]; + }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(match, hex, numStr) { + var num = parseInt(hex || numStr, hex ? 16 : 10); // read num + return String.fromCharCode(num); + }); + } + + // When a style is set in JS, map it to the corresponding CSS attribute + var styleMap = { + "alignmentBaseline": "alignment-baseline", + "background": "background", + "backgroundAttachment": "background-attachment", + "backgroundClip": "background-clip", + "backgroundColor": "background-color", + "backgroundImage": "background-image", + "backgroundOrigin": "background-origin", + "backgroundPosition": "background-position", + "backgroundPositionX": "background-position-x", + "backgroundPositionY": "background-position-y", + "backgroundRepeat": "background-repeat", + "backgroundRepeatX": "background-repeat-x", + "backgroundRepeatY": "background-repeat-y", + "backgroundSize": "background-size", + "baselineShift": "baseline-shift", + "border": "border", + "borderBottom": "border-bottom", + "borderBottomColor": "border-bottom-color", + "borderBottomLeftRadius": "border-bottom-left-radius", + "borderBottomRightRadius": "border-bottom-right-radius", + "borderBottomStyle": "border-bottom-style", + "borderBottomWidth": "border-bottom-width", + "borderCollapse": "border-collapse", + "borderColor": "border-color", + "borderImage": "border-image", + "borderImageOutset": "border-image-outset", + "borderImageRepeat": "border-image-repeat", + "borderImageSlice": "border-image-slice", + "borderImageSource": "border-image-source", + "borderImageWidth": "border-image-width", + "borderLeft": "border-left", + "borderLeftColor": "border-left-color", + "borderLeftStyle": "border-left-style", + "borderLeftWidth": "border-left-width", + "borderRadius": "border-radius", + "borderRight": "border-right", + "borderRightColor": "border-right-color", + "borderRightStyle": "border-right-style", + "borderRightWidth": "border-right-width", + "borderSpacing": "border-spacing", + "borderStyle": "border-style", + "borderTop": "border-top", + "borderTopColor": "border-top-color", + "borderTopLeftRadius": "border-top-left-radius", + "borderTopRightRadius": "border-top-right-radius", + "borderTopStyle": "border-top-style", + "borderTopWidth": "border-top-width", + "borderWidth": "border-width", + "bottom": "bottom", + "boxShadow": "box-shadow", + "boxSizing": "box-sizing", + "captionSide": "caption-side", + "clear": "clear", + "clip": "clip", + "clipPath": "clip-path", + "clipRule": "clip-rule", + "color": "color", + "colorInterpolation": "color-interpolation", + "colorInterpolationFilters": "color-interpolation-filters", + "colorProfile": "color-profile", + "colorRendering": "color-rendering", + "content": "content", + "counterIncrement": "counter-increment", + "counterReset": "counter-reset", + "cursor": "cursor", + "direction": "direction", + "display": "display", + "dominantBaseline": "dominant-baseline", + "emptyCells": "empty-cells", + "enableBackground": "enable-background", + "fill": "fill", + "fillOpacity": "fill-opacity", + "fillRule": "fill-rule", + "filter": "filter", + "cssFloat": "float", + "floodColor": "flood-color", + "floodOpacity": "flood-opacity", + "font": "font", + "fontFamily": "font-family", + "fontSize": "font-size", + "fontStretch": "font-stretch", + "fontStyle": "font-style", + "fontVariant": "font-variant", + "fontWeight": "font-weight", + "glyphOrientationHorizontal": "glyph-orientation-horizontal", + "glyphOrientationVertical": "glyph-orientation-vertical", + "height": "height", + "imageRendering": "image-rendering", + "kerning": "kerning", + "left": "left", + "letterSpacing": "letter-spacing", + "lightingColor": "lighting-color", + "lineHeight": "line-height", + "listStyle": "list-style", + "listStyleImage": "list-style-image", + "listStylePosition": "list-style-position", + "listStyleType": "list-style-type", + "margin": "margin", + "marginBottom": "margin-bottom", + "marginLeft": "margin-left", + "marginRight": "margin-right", + "marginTop": "margin-top", + "marker": "marker", + "markerEnd": "marker-end", + "markerMid": "marker-mid", + "markerStart": "marker-start", + "mask": "mask", + "maxHeight": "max-height", + "maxWidth": "max-width", + "minHeight": "min-height", + "minWidth": "min-width", + "opacity": "opacity", + "orphans": "orphans", + "outline": "outline", + "outlineColor": "outline-color", + "outlineOffset": "outline-offset", + "outlineStyle": "outline-style", + "outlineWidth": "outline-width", + "overflow": "overflow", + "overflowX": "overflow-x", + "overflowY": "overflow-y", + "padding": "padding", + "paddingBottom": "padding-bottom", + "paddingLeft": "padding-left", + "paddingRight": "padding-right", + "paddingTop": "padding-top", + "page": "page", + "pageBreakAfter": "page-break-after", + "pageBreakBefore": "page-break-before", + "pageBreakInside": "page-break-inside", + "pointerEvents": "pointer-events", + "position": "position", + "quotes": "quotes", + "resize": "resize", + "right": "right", + "shapeRendering": "shape-rendering", + "size": "size", + "speak": "speak", + "src": "src", + "stopColor": "stop-color", + "stopOpacity": "stop-opacity", + "stroke": "stroke", + "strokeDasharray": "stroke-dasharray", + "strokeDashoffset": "stroke-dashoffset", + "strokeLinecap": "stroke-linecap", + "strokeLinejoin": "stroke-linejoin", + "strokeMiterlimit": "stroke-miterlimit", + "strokeOpacity": "stroke-opacity", + "strokeWidth": "stroke-width", + "tableLayout": "table-layout", + "textAlign": "text-align", + "textAnchor": "text-anchor", + "textDecoration": "text-decoration", + "textIndent": "text-indent", + "textLineThrough": "text-line-through", + "textLineThroughColor": "text-line-through-color", + "textLineThroughMode": "text-line-through-mode", + "textLineThroughStyle": "text-line-through-style", + "textLineThroughWidth": "text-line-through-width", + "textOverflow": "text-overflow", + "textOverline": "text-overline", + "textOverlineColor": "text-overline-color", + "textOverlineMode": "text-overline-mode", + "textOverlineStyle": "text-overline-style", + "textOverlineWidth": "text-overline-width", + "textRendering": "text-rendering", + "textShadow": "text-shadow", + "textTransform": "text-transform", + "textUnderline": "text-underline", + "textUnderlineColor": "text-underline-color", + "textUnderlineMode": "text-underline-mode", + "textUnderlineStyle": "text-underline-style", + "textUnderlineWidth": "text-underline-width", + "top": "top", + "unicodeBidi": "unicode-bidi", + "unicodeRange": "unicode-range", + "vectorEffect": "vector-effect", + "verticalAlign": "vertical-align", + "visibility": "visibility", + "whiteSpace": "white-space", + "widows": "widows", + "width": "width", + "wordBreak": "word-break", + "wordSpacing": "word-spacing", + "wordWrap": "word-wrap", + "writingMode": "writing-mode", + "zIndex": "z-index", + "zoom": "zoom", + }; + + // Elements that can be self-closing + var voidElems = { + "area": true, + "base": true, + "br": true, + "col": true, + "command": true, + "embed": true, + "hr": true, + "img": true, + "input": true, + "link": true, + "meta": true, + "param": true, + "source": true, + "wbr": true + }; + + var whitespace = [" ", "\t", "\n", "\r"]; + + // See https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType + var nodeTypes = { + ELEMENT_NODE: 1, + ATTRIBUTE_NODE: 2, + TEXT_NODE: 3, + CDATA_SECTION_NODE: 4, + ENTITY_REFERENCE_NODE: 5, + ENTITY_NODE: 6, + PROCESSING_INSTRUCTION_NODE: 7, + COMMENT_NODE: 8, + DOCUMENT_NODE: 9, + DOCUMENT_TYPE_NODE: 10, + DOCUMENT_FRAGMENT_NODE: 11, + NOTATION_NODE: 12 + }; + + function getElementsByTagName(tag) { + tag = tag.toUpperCase(); + var elems = []; + var allTags = (tag === "*"); + function getElems(node) { + var length = node.children.length; + for (var i = 0; i < length; i++) { + var child = node.children[i]; + if (allTags || (child.tagName === tag)) + elems.push(child); + getElems(child); + } + } + getElems(this); + elems._isLiveNodeList = true; + return elems; + } + + var Node = function () {}; + + Node.prototype = { + attributes: null, + childNodes: null, + localName: null, + nodeName: null, + parentNode: null, + textContent: null, + nextSibling: null, + previousSibling: null, + + get firstChild() { + return this.childNodes[0] || null; + }, + + get firstElementChild() { + return this.children[0] || null; + }, + + get lastChild() { + return this.childNodes[this.childNodes.length - 1] || null; + }, + + get lastElementChild() { + return this.children[this.children.length - 1] || null; + }, + + appendChild: function (child) { + if (child.parentNode) { + child.parentNode.removeChild(child); + } + + var last = this.lastChild; + if (last) + last.nextSibling = child; + child.previousSibling = last; + + if (child.nodeType === Node.ELEMENT_NODE) { + child.previousElementSibling = this.children[this.children.length - 1] || null; + this.children.push(child); + child.previousElementSibling && (child.previousElementSibling.nextElementSibling = child); + } + this.childNodes.push(child); + child.parentNode = this; + }, + + removeChild: function (child) { + var childNodes = this.childNodes; + var childIndex = childNodes.indexOf(child); + if (childIndex === -1) { + throw "removeChild: node not found"; + } else { + child.parentNode = null; + var prev = child.previousSibling; + var next = child.nextSibling; + if (prev) + prev.nextSibling = next; + if (next) + next.previousSibling = prev; + + if (child.nodeType === Node.ELEMENT_NODE) { + prev = child.previousElementSibling; + next = child.nextElementSibling; + if (prev) + prev.nextElementSibling = next; + if (next) + next.previousElementSibling = prev; + this.children.splice(this.children.indexOf(child), 1); + } + + child.previousSibling = child.nextSibling = null; + child.previousElementSibling = child.nextElementSibling = null; + + return childNodes.splice(childIndex, 1)[0]; + } + }, + + replaceChild: function (newNode, oldNode) { + var childNodes = this.childNodes; + var childIndex = childNodes.indexOf(oldNode); + if (childIndex === -1) { + throw "replaceChild: node not found"; + } else { + // This will take care of updating the new node if it was somewhere else before: + if (newNode.parentNode) + newNode.parentNode.removeChild(newNode); + + childNodes[childIndex] = newNode; + + // update the new node's sibling properties, and its new siblings' sibling properties + newNode.nextSibling = oldNode.nextSibling; + newNode.previousSibling = oldNode.previousSibling; + if (newNode.nextSibling) + newNode.nextSibling.previousSibling = newNode; + if (newNode.previousSibling) + newNode.previousSibling.nextSibling = newNode; + + newNode.parentNode = this; + + // Now deal with elements before we clear out those values for the old node, + // because it can help us take shortcuts here: + if (newNode.nodeType === Node.ELEMENT_NODE) { + if (oldNode.nodeType === Node.ELEMENT_NODE) { + // Both were elements, which makes this easier, we just swap things out: + newNode.previousElementSibling = oldNode.previousElementSibling; + newNode.nextElementSibling = oldNode.nextElementSibling; + if (newNode.previousElementSibling) + newNode.previousElementSibling.nextElementSibling = newNode; + if (newNode.nextElementSibling) + newNode.nextElementSibling.previousElementSibling = newNode; + this.children[this.children.indexOf(oldNode)] = newNode; + } else { + // Hard way: + newNode.previousElementSibling = (function() { + for (var i = childIndex - 1; i >= 0; i--) { + if (childNodes[i].nodeType === Node.ELEMENT_NODE) + return childNodes[i]; + } + return null; + })(); + if (newNode.previousElementSibling) { + newNode.nextElementSibling = newNode.previousElementSibling.nextElementSibling; + } else { + newNode.nextElementSibling = (function() { + for (var i = childIndex + 1; i < childNodes.length; i++) { + if (childNodes[i].nodeType === Node.ELEMENT_NODE) + return childNodes[i]; + } + return null; + })(); + } + if (newNode.previousElementSibling) + newNode.previousElementSibling.nextElementSibling = newNode; + if (newNode.nextElementSibling) + newNode.nextElementSibling.previousElementSibling = newNode; + + if (newNode.nextElementSibling) + this.children.splice(this.children.indexOf(newNode.nextElementSibling), 0, newNode); + else + this.children.push(newNode); + } + } else if (oldNode.nodeType === Node.ELEMENT_NODE) { + // new node is not an element node. + // if the old one was, update its element siblings: + if (oldNode.previousElementSibling) + oldNode.previousElementSibling.nextElementSibling = oldNode.nextElementSibling; + if (oldNode.nextElementSibling) + oldNode.nextElementSibling.previousElementSibling = oldNode.previousElementSibling; + this.children.splice(this.children.indexOf(oldNode), 1); + + // If the old node wasn't an element, neither the new nor the old node was an element, + // and the children array and its members shouldn't need any updating. + } + + + oldNode.parentNode = null; + oldNode.previousSibling = null; + oldNode.nextSibling = null; + if (oldNode.nodeType === Node.ELEMENT_NODE) { + oldNode.previousElementSibling = null; + oldNode.nextElementSibling = null; + } + return oldNode; + } + }, + + __JSDOMParser__: true, + }; + + for (var nodeType in nodeTypes) { + Node[nodeType] = Node.prototype[nodeType] = nodeTypes[nodeType]; + } + + var Attribute = function (name, value) { + this.name = name; + this._value = value; + }; + + Attribute.prototype = { + get value() { + return this._value; + }, + setValue: function(newValue) { + this._value = newValue; + }, + getEncodedValue: function() { + return encodeHTML(this._value); + }, + }; + + var Comment = function () { + this.childNodes = []; + }; + + Comment.prototype = { + __proto__: Node.prototype, + + nodeName: "#comment", + nodeType: Node.COMMENT_NODE + }; + + var Text = function () { + this.childNodes = []; + }; + + Text.prototype = { + __proto__: Node.prototype, + + nodeName: "#text", + nodeType: Node.TEXT_NODE, + get textContent() { + if (typeof this._textContent === "undefined") { + this._textContent = decodeHTML(this._innerHTML || ""); + } + return this._textContent; + }, + get innerHTML() { + if (typeof this._innerHTML === "undefined") { + this._innerHTML = encodeTextContentHTML(this._textContent || ""); + } + return this._innerHTML; + }, + + set innerHTML(newHTML) { + this._innerHTML = newHTML; + delete this._textContent; + }, + set textContent(newText) { + this._textContent = newText; + delete this._innerHTML; + }, + }; + + var Document = function (url) { + this.documentURI = url; + this.styleSheets = []; + this.childNodes = []; + this.children = []; + }; + + Document.prototype = { + __proto__: Node.prototype, + + nodeName: "#document", + nodeType: Node.DOCUMENT_NODE, + title: "", + + getElementsByTagName: getElementsByTagName, + + getElementById: function (id) { + function getElem(node) { + var length = node.children.length; + if (node.id === id) + return node; + for (var i = 0; i < length; i++) { + var el = getElem(node.children[i]); + if (el) + return el; + } + return null; + } + return getElem(this); + }, + + createElement: function (tag) { + var node = new Element(tag); + return node; + }, + + createTextNode: function (text) { + var node = new Text(); + node.textContent = text; + return node; + }, + + get baseURI() { + if (!this.hasOwnProperty("_baseURI")) { + this._baseURI = this.documentURI; + var baseElements = this.getElementsByTagName("base"); + var href = baseElements[0] && baseElements[0].getAttribute("href"); + if (href) { + try { + this._baseURI = (new URL(href, this._baseURI)).href; + } catch (ex) {/* Just fall back to documentURI */} + } + } + return this._baseURI; + }, + }; + + var Element = function (tag) { + // We use this to find the closing tag. + this._matchingTag = tag; + // We're explicitly a non-namespace aware parser, we just pretend it's all HTML. + var lastColonIndex = tag.lastIndexOf(":"); + if (lastColonIndex != -1) { + tag = tag.substring(lastColonIndex + 1); + } + this.attributes = []; + this.childNodes = []; + this.children = []; + this.nextElementSibling = this.previousElementSibling = null; + this.localName = tag.toLowerCase(); + this.tagName = tag.toUpperCase(); + this.style = new Style(this); + }; + + Element.prototype = { + __proto__: Node.prototype, + + nodeType: Node.ELEMENT_NODE, + + getElementsByTagName: getElementsByTagName, + + get className() { + return this.getAttribute("class") || ""; + }, + + set className(str) { + this.setAttribute("class", str); + }, + + get id() { + return this.getAttribute("id") || ""; + }, + + set id(str) { + this.setAttribute("id", str); + }, + + get href() { + return this.getAttribute("href") || ""; + }, + + set href(str) { + this.setAttribute("href", str); + }, + + get src() { + return this.getAttribute("src") || ""; + }, + + set src(str) { + this.setAttribute("src", str); + }, + + get srcset() { + return this.getAttribute("srcset") || ""; + }, + + set srcset(str) { + this.setAttribute("srcset", str); + }, + + get nodeName() { + return this.tagName; + }, + + get innerHTML() { + function getHTML(node) { + var i = 0; + for (i = 0; i < node.childNodes.length; i++) { + var child = node.childNodes[i]; + if (child.localName) { + arr.push("<" + child.localName); + + // serialize attribute list + for (var j = 0; j < child.attributes.length; j++) { + var attr = child.attributes[j]; + // the attribute value will be HTML escaped. + var val = attr.getEncodedValue(); + var quote = (val.indexOf('"') === -1 ? '"' : "'"); + arr.push(" " + attr.name + "=" + quote + val + quote); + } + + if (child.localName in voidElems && !child.childNodes.length) { + // if this is a self-closing element, end it here + arr.push("/>"); + } else { + // otherwise, add its children + arr.push(">"); + getHTML(child); + arr.push(""); + } + } else { + // This is a text node, so asking for innerHTML won't recurse. + arr.push(child.innerHTML); + } + } + } + + // Using Array.join() avoids the overhead from lazy string concatenation. + var arr = []; + getHTML(this); + return arr.join(""); + }, + + set innerHTML(html) { + var parser = new JSDOMParser(); + var node = parser.parse(html); + var i; + for (i = this.childNodes.length; --i >= 0;) { + this.childNodes[i].parentNode = null; + } + this.childNodes = node.childNodes; + this.children = node.children; + for (i = this.childNodes.length; --i >= 0;) { + this.childNodes[i].parentNode = this; + } + }, + + set textContent(text) { + // clear parentNodes for existing children + for (var i = this.childNodes.length; --i >= 0;) { + this.childNodes[i].parentNode = null; + } + + var node = new Text(); + this.childNodes = [ node ]; + this.children = []; + node.textContent = text; + node.parentNode = this; + }, + + get textContent() { + function getText(node) { + var nodes = node.childNodes; + for (var i = 0; i < nodes.length; i++) { + var child = nodes[i]; + if (child.nodeType === 3) { + text.push(child.textContent); + } else { + getText(child); + } + } + } + + // Using Array.join() avoids the overhead from lazy string concatenation. + // See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes + var text = []; + getText(this); + return text.join(""); + }, + + getAttribute: function (name) { + for (var i = this.attributes.length; --i >= 0;) { + var attr = this.attributes[i]; + if (attr.name === name) { + return attr.value; + } + } + return undefined; + }, + + setAttribute: function (name, value) { + for (var i = this.attributes.length; --i >= 0;) { + var attr = this.attributes[i]; + if (attr.name === name) { + attr.setValue(value); + return; + } + } + this.attributes.push(new Attribute(name, value)); + }, + + removeAttribute: function (name) { + for (var i = this.attributes.length; --i >= 0;) { + var attr = this.attributes[i]; + if (attr.name === name) { + this.attributes.splice(i, 1); + break; + } + } + }, + + hasAttribute: function (name) { + return this.attributes.some(function (attr) { + return attr.name == name; + }); + }, + }; + + var Style = function (node) { + this.node = node; + }; + + // getStyle() and setStyle() use the style attribute string directly. This + // won't be very efficient if there are a lot of style manipulations, but + // it's the easiest way to make sure the style attribute string and the JS + // style property stay in sync. Readability.js doesn't do many style + // manipulations, so this should be okay. + Style.prototype = { + getStyle: function (styleName) { + var attr = this.node.getAttribute("style"); + if (!attr) + return undefined; + + var styles = attr.split(";"); + for (var i = 0; i < styles.length; i++) { + var style = styles[i].split(":"); + var name = style[0].trim(); + if (name === styleName) + return style[1].trim(); + } + + return undefined; + }, + + setStyle: function (styleName, styleValue) { + var value = this.node.getAttribute("style") || ""; + var index = 0; + do { + var next = value.indexOf(";", index) + 1; + var length = next - index - 1; + var style = (length > 0 ? value.substr(index, length) : value.substr(index)); + if (style.substr(0, style.indexOf(":")).trim() === styleName) { + value = value.substr(0, index).trim() + (next ? " " + value.substr(next).trim() : ""); + break; + } + index = next; + } while (index); + + value += " " + styleName + ": " + styleValue + ";"; + this.node.setAttribute("style", value.trim()); + } + }; + + // For each item in styleMap, define a getter and setter on the style + // property. + for (var jsName in styleMap) { + (function (cssName) { + Style.prototype.__defineGetter__(jsName, function () { + return this.getStyle(cssName); + }); + Style.prototype.__defineSetter__(jsName, function (value) { + this.setStyle(cssName, value); + }); + })(styleMap[jsName]); + } + + var JSDOMParser = function () { + this.currentChar = 0; + + // In makeElementNode() we build up many strings one char at a time. Using + // += for this results in lots of short-lived intermediate strings. It's + // better to build an array of single-char strings and then join() them + // together at the end. And reusing a single array (i.e. |this.strBuf|) + // over and over for this purpose uses less memory than using a new array + // for each string. + this.strBuf = []; + + // Similarly, we reuse this array to return the two arguments from + // makeElementNode(), which saves us from having to allocate a new array + // every time. + this.retPair = []; + + this.errorState = ""; + }; + + JSDOMParser.prototype = { + error: function(m) { + if (typeof dump !== "undefined") { + dump("JSDOMParser error: " + m + "\n"); + } else if (typeof console !== "undefined") { + console.log("JSDOMParser error: " + m + "\n"); + } + this.errorState += m + "\n"; + }, + + /** + * Look at the next character without advancing the index. + */ + peekNext: function () { + return this.html[this.currentChar]; + }, + + /** + * Get the next character and advance the index. + */ + nextChar: function () { + return this.html[this.currentChar++]; + }, + + /** + * Called after a quote character is read. This finds the next quote + * character and returns the text string in between. + */ + readString: function (quote) { + var str; + var n = this.html.indexOf(quote, this.currentChar); + if (n === -1) { + this.currentChar = this.html.length; + str = null; + } else { + str = this.html.substring(this.currentChar, n); + this.currentChar = n + 1; + } + + return str; + }, + + /** + * Called when parsing a node. This finds the next name/value attribute + * pair and adds the result to the attributes list. + */ + readAttribute: function (node) { + var name = ""; + + var n = this.html.indexOf("=", this.currentChar); + if (n === -1) { + this.currentChar = this.html.length; + } else { + // Read until a '=' character is hit; this will be the attribute key + name = this.html.substring(this.currentChar, n); + this.currentChar = n + 1; + } + + if (!name) + return; + + // After a '=', we should see a '"' for the attribute value + var c = this.nextChar(); + if (c !== '"' && c !== "'") { + this.error("Error reading attribute " + name + ", expecting '\"'"); + return; + } + + // Read the attribute value (and consume the matching quote) + var value = this.readString(c); + + node.attributes.push(new Attribute(name, decodeHTML(value))); + + return; + }, + + /** + * Parses and returns an Element node. This is called after a '<' has been + * read. + * + * @returns an array; the first index of the array is the parsed node; + * the second index is a boolean indicating whether this is a void + * Element + */ + makeElementNode: function (retPair) { + var c = this.nextChar(); + + // Read the Element tag name + var strBuf = this.strBuf; + strBuf.length = 0; + while (whitespace.indexOf(c) == -1 && c !== ">" && c !== "/") { + if (c === undefined) + return false; + strBuf.push(c); + c = this.nextChar(); + } + var tag = strBuf.join(""); + + if (!tag) + return false; + + var node = new Element(tag); + + // Read Element attributes + while (c !== "/" && c !== ">") { + if (c === undefined) + return false; + while (whitespace.indexOf(this.html[this.currentChar++]) != -1) { + // Advance cursor to first non-whitespace char. + } + this.currentChar--; + c = this.nextChar(); + if (c !== "/" && c !== ">") { + --this.currentChar; + this.readAttribute(node); + } + } + + // If this is a self-closing tag, read '/>' + var closed = false; + if (c === "/") { + closed = true; + c = this.nextChar(); + if (c !== ">") { + this.error("expected '>' to close " + tag); + return false; + } + } + + retPair[0] = node; + retPair[1] = closed; + return true; + }, + + /** + * If the current input matches this string, advance the input index; + * otherwise, do nothing. + * + * @returns whether input matched string + */ + match: function (str) { + var strlen = str.length; + if (this.html.substr(this.currentChar, strlen).toLowerCase() === str.toLowerCase()) { + this.currentChar += strlen; + return true; + } + return false; + }, + + /** + * Searches the input until a string is found and discards all input up to + * and including the matched string. + */ + discardTo: function (str) { + var index = this.html.indexOf(str, this.currentChar) + str.length; + if (index === -1) + this.currentChar = this.html.length; + this.currentChar = index; + }, + + /** + * Reads child nodes for the given node. + */ + readChildren: function (node) { + var child; + while ((child = this.readNode())) { + // Don't keep Comment nodes + if (child.nodeType !== 8) { + node.appendChild(child); + } + } + }, + + discardNextComment: function() { + if (this.match("--")) { + this.discardTo("-->"); + } else { + var c = this.nextChar(); + while (c !== ">") { + if (c === undefined) + return null; + if (c === '"' || c === "'") + this.readString(c); + c = this.nextChar(); + } + } + return new Comment(); + }, + + + /** + * Reads the next child node from the input. If we're reading a closing + * tag, or if we've reached the end of input, return null. + * + * @returns the node + */ + readNode: function () { + var c = this.nextChar(); + + if (c === undefined) + return null; + + // Read any text as Text node + var textNode; + if (c !== "<") { + --this.currentChar; + textNode = new Text(); + var n = this.html.indexOf("<", this.currentChar); + if (n === -1) { + textNode.innerHTML = this.html.substring(this.currentChar, this.html.length); + this.currentChar = this.html.length; + } else { + textNode.innerHTML = this.html.substring(this.currentChar, n); + this.currentChar = n; + } + return textNode; + } + + if (this.match("![CDATA[")) { + var endChar = this.html.indexOf("]]>", this.currentChar); + if (endChar === -1) { + this.error("unclosed CDATA section"); + return null; + } + textNode = new Text(); + textNode.textContent = this.html.substring(this.currentChar, endChar); + this.currentChar = endChar + ("]]>").length; + return textNode; + } + + c = this.peekNext(); + + // Read Comment node. Normally, Comment nodes know their inner + // textContent, but we don't really care about Comment nodes (we throw + // them away in readChildren()). So just returning an empty Comment node + // here is sufficient. + if (c === "!" || c === "?") { + // We're still before the ! or ? that is starting this comment: + this.currentChar++; + return this.discardNextComment(); + } + + // If we're reading a closing tag, return null. This means we've reached + // the end of this set of child nodes. + if (c === "/") { + --this.currentChar; + return null; + } + + // Otherwise, we're looking at an Element node + var result = this.makeElementNode(this.retPair); + if (!result) + return null; + + var node = this.retPair[0]; + var closed = this.retPair[1]; + var localName = node.localName; + + // If this isn't a void Element, read its child nodes + if (!closed) { + this.readChildren(node); + var closingTag = ""; + if (!this.match(closingTag)) { + this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length)); + return null; + } + } + + // Only use the first title, because SVG might have other + // title elements which we don't care about (medium.com + // does this, at least). + if (localName === "title" && !this.doc.title) { + this.doc.title = node.textContent.trim(); + } else if (localName === "head") { + this.doc.head = node; + } else if (localName === "body") { + this.doc.body = node; + } else if (localName === "html") { + this.doc.documentElement = node; + } + + return node; + }, + + /** + * Parses an HTML string and returns a JS implementation of the Document. + */ + parse: function (html, url) { + this.html = html; + var doc = this.doc = new Document(url); + this.readChildren(doc); + + // If this is an HTML document, remove root-level children except for the + // node + if (doc.documentElement) { + for (var i = doc.childNodes.length; --i >= 0;) { + var child = doc.childNodes[i]; + if (child !== doc.documentElement) { + doc.removeChild(child); + } + } + } + + return doc; + } + }; + + // Attach the standard DOM types to the global scope + global.Node = Node; + global.Comment = Comment; + global.Document = Document; + global.Element = Element; + global.Text = Text; + + // Attach JSDOMParser to the global scope + global.JSDOMParser = JSDOMParser; + +})(this); + +if (typeof module === "object") { + module.exports = this.JSDOMParser; +} diff --git a/toolkit/components/reader/Readability-readerable.js b/toolkit/components/reader/Readability-readerable.js new file mode 100644 index 0000000000..fe9c89d789 --- /dev/null +++ b/toolkit/components/reader/Readability-readerable.js @@ -0,0 +1,116 @@ +/* eslint-env es6:false */ +/* globals exports */ +/* + * DO NOT MODIFY THIS FILE DIRECTLY! + * + * This is a shared library that is maintained in an external repo: + * https://github.com/mozilla/readability + */ + +/* + * Copyright (c) 2010 Arc90 Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This code is heavily based on Arc90's readability.js (1.7.1) script + * available at: http://code.google.com/p/arc90labs-readability + */ + +var REGEXPS = { + // NOTE: These two regular expressions are duplicated in + // Readability.js. Please keep both copies in sync. + unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, + okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, +}; + +function isNodeVisible(node) { + // Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes. + return (!node.style || node.style.display != "none") + && !node.hasAttribute("hidden") + //check for "fallback-image" so that wikimedia math images are displayed + && (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1)); +} + +/** + * Decides whether or not the document is reader-able without parsing the whole thing. + * @param {Object} options Configuration object. + * @param {number} [options.minContentLength=140] The minimum node content length used to decide if the document is readerable. + * @param {number} [options.minScore=20] The minumum cumulated 'score' used to determine if the document is readerable. + * @param {Function} [options.visibilityChecker=isNodeVisible] The function used to determine if a node is visible. + * @return {boolean} Whether or not we suspect Readability.parse() will suceeed at returning an article object. + */ +function isProbablyReaderable(doc, options = {}) { + // For backward compatibility reasons 'options' can either be a configuration object or the function used + // to determine if a node is visible. + if (typeof options == "function") { + options = { visibilityChecker: options }; + } + + var defaultOptions = { minScore: 20, minContentLength: 140, visibilityChecker: isNodeVisible }; + options = Object.assign(defaultOptions, options); + + var nodes = doc.querySelectorAll("p, pre, article"); + + // Get
nodes which have
node(s) and append them into the `nodes` variable. + // Some articles' DOM structures might look like + //
+ // Sentences
+ //
+ // Sentences
+ //
+ var brNodes = doc.querySelectorAll("div > br"); + if (brNodes.length) { + var set = new Set(nodes); + [].forEach.call(brNodes, function (node) { + set.add(node.parentNode); + }); + nodes = Array.from(set); + } + + var score = 0; + // This is a little cheeky, we use the accumulator 'score' to decide what to return from + // this callback: + return [].some.call(nodes, function (node) { + if (!options.visibilityChecker(node)) { + return false; + } + + var matchString = node.className + " " + node.id; + if (REGEXPS.unlikelyCandidates.test(matchString) && + !REGEXPS.okMaybeItsACandidate.test(matchString)) { + return false; + } + + if (node.matches("li p")) { + return false; + } + + var textContentLength = node.textContent.trim().length; + if (textContentLength < options.minContentLength) { + return false; + } + + score += Math.sqrt(textContentLength - options.minContentLength); + + if (score > options.minScore) { + return true; + } + return false; + }); +} + +if (typeof module === "object") { + module.exports = isProbablyReaderable; +} diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js new file mode 100644 index 0000000000..44604970c1 --- /dev/null +++ b/toolkit/components/reader/Readability.js @@ -0,0 +1,2290 @@ +/*eslint-env es6:false*/ +/* + * DO NOT MODIFY THIS FILE DIRECTLY! + * + * This is a shared library that is maintained in an external repo: + * https://github.com/mozilla/readability + */ + +/* + * Copyright (c) 2010 Arc90 Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This code is heavily based on Arc90's readability.js (1.7.1) script + * available at: http://code.google.com/p/arc90labs-readability + */ + +/** + * Public constructor. + * @param {HTMLDocument} doc The document to parse. + * @param {Object} options The options object. + */ +function Readability(doc, options) { + // In some older versions, people passed a URI as the first argument. Cope: + if (options && options.documentElement) { + doc = options; + options = arguments[2]; + } else if (!doc || !doc.documentElement) { + throw new Error("First argument to Readability constructor should be a document object."); + } + options = options || {}; + + this._doc = doc; + this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__; + this._articleTitle = null; + this._articleByline = null; + this._articleDir = null; + this._articleSiteName = null; + this._attempts = []; + + // Configurable options + this._debug = !!options.debug; + this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; + this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; + this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD; + this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []); + this._keepClasses = !!options.keepClasses; + this._serializer = options.serializer || function(el) { + return el.innerHTML; + }; + this._disableJSONLD = !!options.disableJSONLD; + + // Start with all flags set + this._flags = this.FLAG_STRIP_UNLIKELYS | + this.FLAG_WEIGHT_CLASSES | + this.FLAG_CLEAN_CONDITIONALLY; + + + // Control whether log messages are sent to the console + if (this._debug) { + let logNode = function(node) { + if (node.nodeType == node.TEXT_NODE) { + return `${node.nodeName} ("${node.textContent}")`; + } + let attrPairs = Array.from(node.attributes || [], function(attr) { + return `${attr.name}="${attr.value}"`; + }).join(" "); + return `<${node.localName} ${attrPairs}>`; + }; + this.log = function () { + if (typeof dump !== "undefined") { + var msg = Array.prototype.map.call(arguments, function(x) { + return (x && x.nodeName) ? logNode(x) : x; + }).join(" "); + dump("Reader: (Readability) " + msg + "\n"); + } else if (typeof console !== "undefined") { + let args = Array.from(arguments, arg => { + if (arg && arg.nodeType == this.ELEMENT_NODE) { + return logNode(arg); + } + return arg; + }); + args.unshift("Reader: (Readability)"); + console.log.apply(console, args); + } + }; + } else { + this.log = function () {}; + } +} + +Readability.prototype = { + FLAG_STRIP_UNLIKELYS: 0x1, + FLAG_WEIGHT_CLASSES: 0x2, + FLAG_CLEAN_CONDITIONALLY: 0x4, + + // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType + ELEMENT_NODE: 1, + TEXT_NODE: 3, + + // Max number of nodes supported by this parser. Default: 0 (no limit) + DEFAULT_MAX_ELEMS_TO_PARSE: 0, + + // The number of top candidates to consider when analysing how + // tight the competition is among candidates. + DEFAULT_N_TOP_CANDIDATES: 5, + + // Element tags to score by default. + DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","), + + // The default number of chars an article must have in order to return a result + DEFAULT_CHAR_THRESHOLD: 500, + + // All of the regular expressions in use within readability. + // Defined up here so we don't instantiate them repeatedly in loops. + REGEXPS: { + // NOTE: These two regular expressions are duplicated in + // Readability-readerable.js. Please keep both copies in sync. + unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, + okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, + + positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, + negative: /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, + extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, + byline: /byline|author|dateline|writtenby|p-author/i, + replaceFonts: /<(\/?)font[^>]*>/gi, + normalize: /\s{2,}/g, + videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i, + shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i, + nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, + prevLink: /(prev|earl|old|new|<|«)/i, + tokenize: /\W+/g, + whitespace: /^\s*$/, + hasContent: /\S$/, + hashUrl: /^#.+/, + srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g, + b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i, + // See: https://schema.org/Article + jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/ + }, + + UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ], + + DIV_TO_P_ELEMS: new Set([ "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL" ]), + + ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"], + + PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ], + + DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ], + + // The commented out elements qualify as phrasing content but tend to be + // removed by readability when put into paragraphs, so we ignore them here. + PHRASING_ELEMS: [ + // "CANVAS", "IFRAME", "SVG", "VIDEO", + "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", + "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL", + "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q", + "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB", + "SUP", "TEXTAREA", "TIME", "VAR", "WBR" + ], + + // These are the classes that readability sets itself. + CLASSES_TO_PRESERVE: [ "page" ], + + // These are the list of HTML entities that need to be escaped. + HTML_ESCAPE_MAP: { + "lt": "<", + "gt": ">", + "amp": "&", + "quot": '"', + "apos": "'", + }, + + /** + * Run any post-process modifications to article content as necessary. + * + * @param Element + * @return void + **/ + _postProcessContent: function(articleContent) { + // Readability cannot open relative uris so we convert them to absolute uris. + this._fixRelativeUris(articleContent); + + this._simplifyNestedElements(articleContent); + + if (!this._keepClasses) { + // Remove classes. + this._cleanClasses(articleContent); + } + }, + + /** + * Iterates over a NodeList, calls `filterFn` for each node and removes node + * if function returned `true`. + * + * If function is not passed, removes all the nodes in node list. + * + * @param NodeList nodeList The nodes to operate on + * @param Function filterFn the function to use as a filter + * @return void + */ + _removeNodes: function(nodeList, filterFn) { + // Avoid ever operating on live node lists. + if (this._docJSDOMParser && nodeList._isLiveNodeList) { + throw new Error("Do not pass live node lists to _removeNodes"); + } + for (var i = nodeList.length - 1; i >= 0; i--) { + var node = nodeList[i]; + var parentNode = node.parentNode; + if (parentNode) { + if (!filterFn || filterFn.call(this, node, i, nodeList)) { + parentNode.removeChild(node); + } + } + } + }, + + /** + * Iterates over a NodeList, and calls _setNodeTag for each node. + * + * @param NodeList nodeList The nodes to operate on + * @param String newTagName the new tag name to use + * @return void + */ + _replaceNodeTags: function(nodeList, newTagName) { + // Avoid ever operating on live node lists. + if (this._docJSDOMParser && nodeList._isLiveNodeList) { + throw new Error("Do not pass live node lists to _replaceNodeTags"); + } + for (const node of nodeList) { + this._setNodeTag(node, newTagName); + } + }, + + /** + * Iterate over a NodeList, which doesn't natively fully implement the Array + * interface. + * + * For convenience, the current object context is applied to the provided + * iterate function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The iterate function. + * @return void + */ + _forEachNode: function(nodeList, fn) { + Array.prototype.forEach.call(nodeList, fn, this); + }, + + /** + * Iterate over a NodeList, and return the first node that passes + * the supplied test function + * + * For convenience, the current object context is applied to the provided + * test function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The test function. + * @return void + */ + _findNode: function(nodeList, fn) { + return Array.prototype.find.call(nodeList, fn, this); + }, + + /** + * Iterate over a NodeList, return true if any of the provided iterate + * function calls returns true, false otherwise. + * + * For convenience, the current object context is applied to the + * provided iterate function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The iterate function. + * @return Boolean + */ + _someNode: function(nodeList, fn) { + return Array.prototype.some.call(nodeList, fn, this); + }, + + /** + * Iterate over a NodeList, return true if all of the provided iterate + * function calls return true, false otherwise. + * + * For convenience, the current object context is applied to the + * provided iterate function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The iterate function. + * @return Boolean + */ + _everyNode: function(nodeList, fn) { + return Array.prototype.every.call(nodeList, fn, this); + }, + + /** + * Concat all nodelists passed as arguments. + * + * @return ...NodeList + * @return Array + */ + _concatNodeLists: function() { + var slice = Array.prototype.slice; + var args = slice.call(arguments); + var nodeLists = args.map(function(list) { + return slice.call(list); + }); + return Array.prototype.concat.apply([], nodeLists); + }, + + _getAllNodesWithTag: function(node, tagNames) { + if (node.querySelectorAll) { + return node.querySelectorAll(tagNames.join(",")); + } + return [].concat.apply([], tagNames.map(function(tag) { + var collection = node.getElementsByTagName(tag); + return Array.isArray(collection) ? collection : Array.from(collection); + })); + }, + + /** + * Removes the class="" attribute from every element in the given + * subtree, except those that match CLASSES_TO_PRESERVE and + * the classesToPreserve array from the options object. + * + * @param Element + * @return void + */ + _cleanClasses: function(node) { + var classesToPreserve = this._classesToPreserve; + var className = (node.getAttribute("class") || "") + .split(/\s+/) + .filter(function(cls) { + return classesToPreserve.indexOf(cls) != -1; + }) + .join(" "); + + if (className) { + node.setAttribute("class", className); + } else { + node.removeAttribute("class"); + } + + for (node = node.firstElementChild; node; node = node.nextElementSibling) { + this._cleanClasses(node); + } + }, + + /** + * Converts each and uri in the given element to an absolute URI, + * ignoring #ref URIs. + * + * @param Element + * @return void + */ + _fixRelativeUris: function(articleContent) { + var baseURI = this._doc.baseURI; + var documentURI = this._doc.documentURI; + function toAbsoluteURI(uri) { + // Leave hash links alone if the base URI matches the document URI: + if (baseURI == documentURI && uri.charAt(0) == "#") { + return uri; + } + + // Otherwise, resolve against base URI: + try { + return new URL(uri, baseURI).href; + } catch (ex) { + // Something went wrong, just return the original: + } + return uri; + } + + var links = this._getAllNodesWithTag(articleContent, ["a"]); + this._forEachNode(links, function(link) { + var href = link.getAttribute("href"); + if (href) { + // Remove links with javascript: URIs, since + // they won't work after scripts have been removed from the page. + if (href.indexOf("javascript:") === 0) { + // if the link only contains simple text content, it can be converted to a text node + if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) { + var text = this._doc.createTextNode(link.textContent); + link.parentNode.replaceChild(text, link); + } else { + // if the link has multiple children, they should all be preserved + var container = this._doc.createElement("span"); + while (link.firstChild) { + container.appendChild(link.firstChild); + } + link.parentNode.replaceChild(container, link); + } + } else { + link.setAttribute("href", toAbsoluteURI(href)); + } + } + }); + + var medias = this._getAllNodesWithTag(articleContent, [ + "img", "picture", "figure", "video", "audio", "source" + ]); + + this._forEachNode(medias, function(media) { + var src = media.getAttribute("src"); + var poster = media.getAttribute("poster"); + var srcset = media.getAttribute("srcset"); + + if (src) { + media.setAttribute("src", toAbsoluteURI(src)); + } + + if (poster) { + media.setAttribute("poster", toAbsoluteURI(poster)); + } + + if (srcset) { + var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) { + return toAbsoluteURI(p1) + (p2 || "") + p3; + }); + + media.setAttribute("srcset", newSrcset); + } + }); + }, + + _simplifyNestedElements: function(articleContent) { + var node = articleContent; + + while (node) { + if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) { + if (this._isElementWithoutContent(node)) { + node = this._removeAndGetNext(node); + continue; + } else if (this._hasSingleTagInsideElement(node, "DIV") || this._hasSingleTagInsideElement(node, "SECTION")) { + var child = node.children[0]; + for (var i = 0; i < node.attributes.length; i++) { + child.setAttribute(node.attributes[i].name, node.attributes[i].value); + } + node.parentNode.replaceChild(child, node); + node = child; + continue; + } + } + + node = this._getNextNode(node); + } + }, + + /** + * Get the article title as an H1. + * + * @return string + **/ + _getArticleTitle: function() { + var doc = this._doc; + var curTitle = ""; + var origTitle = ""; + + try { + curTitle = origTitle = doc.title.trim(); + + // If they had an element with id "title" in their HTML + if (typeof curTitle !== "string") + curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]); + } catch (e) {/* ignore exceptions setting the title. */} + + var titleHadHierarchicalSeparators = false; + function wordCount(str) { + return str.split(/\s+/).length; + } + + // If there's a separator in the title, first remove the final part + if ((/ [\|\-\\\/>»] /).test(curTitle)) { + titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle); + curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1"); + + // If the resulting title is too short (3 words or fewer), remove + // the first part instead: + if (wordCount(curTitle) < 3) + curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, "$1"); + } else if (curTitle.indexOf(": ") !== -1) { + // Check if we have an heading containing this exact string, so we + // could assume it's the full title. + var headings = this._concatNodeLists( + doc.getElementsByTagName("h1"), + doc.getElementsByTagName("h2") + ); + var trimmedTitle = curTitle.trim(); + var match = this._someNode(headings, function(heading) { + return heading.textContent.trim() === trimmedTitle; + }); + + // If we don't, let's extract the title out of the original title string. + if (!match) { + curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1); + + // If the title is now too short, try the first colon instead: + if (wordCount(curTitle) < 3) { + curTitle = origTitle.substring(origTitle.indexOf(":") + 1); + // But if we have too many words before the colon there's something weird + // with the titles and the H tags so let's just use the original title instead + } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) { + curTitle = origTitle; + } + } + } else if (curTitle.length > 150 || curTitle.length < 15) { + var hOnes = doc.getElementsByTagName("h1"); + + if (hOnes.length === 1) + curTitle = this._getInnerText(hOnes[0]); + } + + curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " "); + // If we now have 4 words or fewer as our title, and either no + // 'hierarchical' separators (\, /, > or ») were found in the original + // title or we decreased the number of words by more than 1 word, use + // the original title. + var curTitleWordCount = wordCount(curTitle); + if (curTitleWordCount <= 4 && + (!titleHadHierarchicalSeparators || + curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) { + curTitle = origTitle; + } + + return curTitle; + }, + + /** + * Prepare the HTML document for readability to scrape it. + * This includes things like stripping javascript, CSS, and handling terrible markup. + * + * @return void + **/ + _prepDocument: function() { + var doc = this._doc; + + // Remove all style tags in head + this._removeNodes(this._getAllNodesWithTag(doc, ["style"])); + + if (doc.body) { + this._replaceBrs(doc.body); + } + + this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN"); + }, + + /** + * Finds the next node, starting from the given node, and ignoring + * whitespace in between. If the given node is an element, the same node is + * returned. + */ + _nextNode: function (node) { + var next = node; + while (next + && (next.nodeType != this.ELEMENT_NODE) + && this.REGEXPS.whitespace.test(next.textContent)) { + next = next.nextSibling; + } + return next; + }, + + /** + * Replaces 2 or more successive
elements with a single

. + * Whitespace between
elements are ignored. For example: + *

foo
bar


abc
+ * will become: + *
foo
bar

abc

+ */ + _replaceBrs: function (elem) { + this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) { + var next = br.nextSibling; + + // Whether 2 or more
elements have been found and replaced with a + //

block. + var replaced = false; + + // If we find a
chain, remove the
s until we hit another node + // or non-whitespace. This leaves behind the first
in the chain + // (which will be replaced with a

later). + while ((next = this._nextNode(next)) && (next.tagName == "BR")) { + replaced = true; + var brSibling = next.nextSibling; + next.parentNode.removeChild(next); + next = brSibling; + } + + // If we removed a
chain, replace the remaining
with a

. Add + // all sibling nodes as children of the

until we hit another
+ // chain. + if (replaced) { + var p = this._doc.createElement("p"); + br.parentNode.replaceChild(p, br); + + next = p.nextSibling; + while (next) { + // If we've hit another

, we're done adding children to this

. + if (next.tagName == "BR") { + var nextElem = this._nextNode(next.nextSibling); + if (nextElem && nextElem.tagName == "BR") + break; + } + + if (!this._isPhrasingContent(next)) + break; + + // Otherwise, make this node a child of the new

. + var sibling = next.nextSibling; + p.appendChild(next); + next = sibling; + } + + while (p.lastChild && this._isWhitespace(p.lastChild)) { + p.removeChild(p.lastChild); + } + + if (p.parentNode.tagName === "P") + this._setNodeTag(p.parentNode, "DIV"); + } + }); + }, + + _setNodeTag: function (node, tag) { + this.log("_setNodeTag", node, tag); + if (this._docJSDOMParser) { + node.localName = tag.toLowerCase(); + node.tagName = tag.toUpperCase(); + return node; + } + + var replacement = node.ownerDocument.createElement(tag); + while (node.firstChild) { + replacement.appendChild(node.firstChild); + } + node.parentNode.replaceChild(replacement, node); + if (node.readability) + replacement.readability = node.readability; + + for (var i = 0; i < node.attributes.length; i++) { + try { + replacement.setAttribute(node.attributes[i].name, node.attributes[i].value); + } catch (ex) { + /* it's possible for setAttribute() to throw if the attribute name + * isn't a valid XML Name. Such attributes can however be parsed from + * source in HTML docs, see https://github.com/whatwg/html/issues/4275, + * so we can hit them here and then throw. We don't care about such + * attributes so we ignore them. + */ + } + } + return replacement; + }, + + /** + * Prepare the article node for display. Clean out any inline styles, + * iframes, forms, strip extraneous

tags, etc. + * + * @param Element + * @return void + **/ + _prepArticle: function(articleContent) { + this._cleanStyles(articleContent); + + // Check for data tables before we continue, to avoid removing items in + // those tables, which will often be isolated even though they're + // visually linked to other content-ful elements (text, images, etc.). + this._markDataTables(articleContent); + + this._fixLazyImages(articleContent); + + // Clean out junk from the article content + this._cleanConditionally(articleContent, "form"); + this._cleanConditionally(articleContent, "fieldset"); + this._clean(articleContent, "object"); + this._clean(articleContent, "embed"); + this._clean(articleContent, "footer"); + this._clean(articleContent, "link"); + this._clean(articleContent, "aside"); + + // Clean out elements with little content that have "share" in their id/class combinations from final top candidates, + // which means we don't remove the top candidates even they have "share". + + var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD; + + this._forEachNode(articleContent.children, function (topCandidate) { + this._cleanMatchedNodes(topCandidate, function (node, matchString) { + return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold; + }); + }); + + this._clean(articleContent, "iframe"); + this._clean(articleContent, "input"); + this._clean(articleContent, "textarea"); + this._clean(articleContent, "select"); + this._clean(articleContent, "button"); + this._cleanHeaders(articleContent); + + // Do these last as the previous stuff may have removed junk + // that will affect these + this._cleanConditionally(articleContent, "table"); + this._cleanConditionally(articleContent, "ul"); + this._cleanConditionally(articleContent, "div"); + + // replace H1 with H2 as H1 should be only title that is displayed separately + this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h1"]), "h2"); + + // Remove extra paragraphs + this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) { + var imgCount = paragraph.getElementsByTagName("img").length; + var embedCount = paragraph.getElementsByTagName("embed").length; + var objectCount = paragraph.getElementsByTagName("object").length; + // At this point, nasty iframes have been removed, only remain embedded video ones. + var iframeCount = paragraph.getElementsByTagName("iframe").length; + var totalCount = imgCount + embedCount + objectCount + iframeCount; + + return totalCount === 0 && !this._getInnerText(paragraph, false); + }); + + this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) { + var next = this._nextNode(br.nextSibling); + if (next && next.tagName == "P") + br.parentNode.removeChild(br); + }); + + // Remove single-cell tables + this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) { + var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table; + if (this._hasSingleTagInsideElement(tbody, "TR")) { + var row = tbody.firstElementChild; + if (this._hasSingleTagInsideElement(row, "TD")) { + var cell = row.firstElementChild; + cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV"); + table.parentNode.replaceChild(cell, table); + } + } + }); + }, + + /** + * Initialize a node with the readability object. Also checks the + * className/id for special names to add to its score. + * + * @param Element + * @return void + **/ + _initializeNode: function(node) { + node.readability = {"contentScore": 0}; + + switch (node.tagName) { + case "DIV": + node.readability.contentScore += 5; + break; + + case "PRE": + case "TD": + case "BLOCKQUOTE": + node.readability.contentScore += 3; + break; + + case "ADDRESS": + case "OL": + case "UL": + case "DL": + case "DD": + case "DT": + case "LI": + case "FORM": + node.readability.contentScore -= 3; + break; + + case "H1": + case "H2": + case "H3": + case "H4": + case "H5": + case "H6": + case "TH": + node.readability.contentScore -= 5; + break; + } + + node.readability.contentScore += this._getClassWeight(node); + }, + + _removeAndGetNext: function(node) { + var nextNode = this._getNextNode(node, true); + node.parentNode.removeChild(node); + return nextNode; + }, + + /** + * Traverse the DOM from node to node, starting at the node passed in. + * Pass true for the second parameter to indicate this node itself + * (and its kids) are going away, and we want the next node over. + * + * Calling this in a loop will traverse the DOM depth-first. + */ + _getNextNode: function(node, ignoreSelfAndKids) { + // First check for kids if those aren't being ignored + if (!ignoreSelfAndKids && node.firstElementChild) { + return node.firstElementChild; + } + // Then for siblings... + if (node.nextElementSibling) { + return node.nextElementSibling; + } + // And finally, move up the parent chain *and* find a sibling + // (because this is depth-first traversal, we will have already + // seen the parent nodes themselves). + do { + node = node.parentNode; + } while (node && !node.nextElementSibling); + return node && node.nextElementSibling; + }, + + // compares second text to first one + // 1 = same text, 0 = completely different text + // works the way that it splits both texts into words and then finds words that are unique in second text + // the result is given by the lower length of unique parts + _textSimilarity: function(textA, textB) { + var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); + var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); + if (!tokensA.length || !tokensB.length) { + return 0; + } + var uniqTokensB = tokensB.filter(token => !tokensA.includes(token)); + var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length; + return 1 - distanceB; + }, + + _checkByline: function(node, matchString) { + if (this._articleByline) { + return false; + } + + if (node.getAttribute !== undefined) { + var rel = node.getAttribute("rel"); + var itemprop = node.getAttribute("itemprop"); + } + + if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) { + this._articleByline = node.textContent.trim(); + return true; + } + + return false; + }, + + _getNodeAncestors: function(node, maxDepth) { + maxDepth = maxDepth || 0; + var i = 0, ancestors = []; + while (node.parentNode) { + ancestors.push(node.parentNode); + if (maxDepth && ++i === maxDepth) + break; + node = node.parentNode; + } + return ancestors; + }, + + /*** + * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is + * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. + * + * @param page a document to run upon. Needs to be a full document, complete with body. + * @return Element + **/ + _grabArticle: function (page) { + this.log("**** grabArticle ****"); + var doc = this._doc; + var isPaging = page !== null; + page = page ? page : this._doc.body; + + // We can't grab an article if we don't have a page! + if (!page) { + this.log("No body found in document. Abort."); + return null; + } + + var pageCacheHtml = page.innerHTML; + + while (true) { + this.log("Starting grabArticle loop"); + var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); + + // First, node prepping. Trash nodes that look cruddy (like ones with the + // class name "comment", etc), and turn divs into P tags where they have been + // used inappropriately (as in, where they contain no other block level elements.) + var elementsToScore = []; + var node = this._doc.documentElement; + + let shouldRemoveTitleHeader = true; + + while (node) { + + if (node.tagName === "HTML") { + this._articleLang = node.getAttribute("lang"); + } + + var matchString = node.className + " " + node.id; + + if (!this._isProbablyVisible(node)) { + this.log("Removing hidden node - " + matchString); + node = this._removeAndGetNext(node); + continue; + } + + // Check to see if this node is a byline, and remove it if it is. + if (this._checkByline(node, matchString)) { + node = this._removeAndGetNext(node); + continue; + } + + if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) { + this.log("Removing header: ", node.textContent.trim(), this._articleTitle.trim()); + shouldRemoveTitleHeader = false; + node = this._removeAndGetNext(node); + continue; + } + + // Remove unlikely candidates + if (stripUnlikelyCandidates) { + if (this.REGEXPS.unlikelyCandidates.test(matchString) && + !this.REGEXPS.okMaybeItsACandidate.test(matchString) && + !this._hasAncestorTag(node, "table") && + !this._hasAncestorTag(node, "code") && + node.tagName !== "BODY" && + node.tagName !== "A") { + this.log("Removing unlikely candidate - " + matchString); + node = this._removeAndGetNext(node); + continue; + } + + if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) { + this.log("Removing content with role " + node.getAttribute("role") + " - " + matchString); + node = this._removeAndGetNext(node); + continue; + } + } + + // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). + if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" || + node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" || + node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") && + this._isElementWithoutContent(node)) { + node = this._removeAndGetNext(node); + continue; + } + + if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) { + elementsToScore.push(node); + } + + // Turn all divs that don't have children block level elements into p's + if (node.tagName === "DIV") { + // Put phrasing content into paragraphs. + var p = null; + var childNode = node.firstChild; + while (childNode) { + var nextSibling = childNode.nextSibling; + if (this._isPhrasingContent(childNode)) { + if (p !== null) { + p.appendChild(childNode); + } else if (!this._isWhitespace(childNode)) { + p = doc.createElement("p"); + node.replaceChild(p, childNode); + p.appendChild(childNode); + } + } else if (p !== null) { + while (p.lastChild && this._isWhitespace(p.lastChild)) { + p.removeChild(p.lastChild); + } + p = null; + } + childNode = nextSibling; + } + + // Sites like http://mobile.slate.com encloses each paragraph with a DIV + // element. DIVs with only a P element inside and no text content can be + // safely converted into plain P elements to avoid confusing the scoring + // algorithm with DIVs with are, in practice, paragraphs. + if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) { + var newNode = node.children[0]; + node.parentNode.replaceChild(newNode, node); + node = newNode; + elementsToScore.push(node); + } else if (!this._hasChildBlockElement(node)) { + node = this._setNodeTag(node, "P"); + elementsToScore.push(node); + } + } + node = this._getNextNode(node); + } + + /** + * Loop through all paragraphs, and assign a score to them based on how content-y they look. + * Then add their score to their parent node. + * + * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. + **/ + var candidates = []; + this._forEachNode(elementsToScore, function(elementToScore) { + if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined") + return; + + // If this paragraph is less than 25 characters, don't even count it. + var innerText = this._getInnerText(elementToScore); + if (innerText.length < 25) + return; + + // Exclude nodes with no ancestor. + var ancestors = this._getNodeAncestors(elementToScore, 5); + if (ancestors.length === 0) + return; + + var contentScore = 0; + + // Add a point for the paragraph itself as a base. + contentScore += 1; + + // Add points for any commas within this paragraph. + contentScore += innerText.split(",").length; + + // For every 100 characters in this paragraph, add another point. Up to 3 points. + contentScore += Math.min(Math.floor(innerText.length / 100), 3); + + // Initialize and score ancestors. + this._forEachNode(ancestors, function(ancestor, level) { + if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined") + return; + + if (typeof(ancestor.readability) === "undefined") { + this._initializeNode(ancestor); + candidates.push(ancestor); + } + + // Node score divider: + // - parent: 1 (no division) + // - grandparent: 2 + // - great grandparent+: ancestor level * 3 + if (level === 0) + var scoreDivider = 1; + else if (level === 1) + scoreDivider = 2; + else + scoreDivider = level * 3; + ancestor.readability.contentScore += contentScore / scoreDivider; + }); + }); + + // After we've calculated scores, loop through all of the possible + // candidate nodes we found and find the one with the highest score. + var topCandidates = []; + for (var c = 0, cl = candidates.length; c < cl; c += 1) { + var candidate = candidates[c]; + + // Scale the final candidates score based on link density. Good content + // should have a relatively small link density (5% or less) and be mostly + // unaffected by this operation. + var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); + candidate.readability.contentScore = candidateScore; + + this.log("Candidate:", candidate, "with score " + candidateScore); + + for (var t = 0; t < this._nbTopCandidates; t++) { + var aTopCandidate = topCandidates[t]; + + if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) { + topCandidates.splice(t, 0, candidate); + if (topCandidates.length > this._nbTopCandidates) + topCandidates.pop(); + break; + } + } + } + + var topCandidate = topCandidates[0] || null; + var neededToCreateTopCandidate = false; + var parentOfTopCandidate; + + // If we still have no top candidate, just use the body as a last resort. + // We also have to copy the body node so it is something we can modify. + if (topCandidate === null || topCandidate.tagName === "BODY") { + // Move all of the page's children into topCandidate + topCandidate = doc.createElement("DIV"); + neededToCreateTopCandidate = true; + // Move everything (not just elements, also text nodes etc.) into the container + // so we even include text directly in the body: + while (page.firstChild) { + this.log("Moving child out:", page.firstChild); + topCandidate.appendChild(page.firstChild); + } + + page.appendChild(topCandidate); + + this._initializeNode(topCandidate); + } else if (topCandidate) { + // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array + // and whose scores are quite closed with current `topCandidate` node. + var alternativeCandidateAncestors = []; + for (var i = 1; i < topCandidates.length; i++) { + if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) { + alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i])); + } + } + var MINIMUM_TOPCANDIDATES = 3; + if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) { + parentOfTopCandidate = topCandidate.parentNode; + while (parentOfTopCandidate.tagName !== "BODY") { + var listsContainingThisAncestor = 0; + for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) { + listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate)); + } + if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) { + topCandidate = parentOfTopCandidate; + break; + } + parentOfTopCandidate = parentOfTopCandidate.parentNode; + } + } + if (!topCandidate.readability) { + this._initializeNode(topCandidate); + } + + // Because of our bonus system, parents of candidates might have scores + // themselves. They get half of the node. There won't be nodes with higher + // scores than our topCandidate, but if we see the score going *up* in the first + // few steps up the tree, that's a decent sign that there might be more content + // lurking in other places that we want to unify in. The sibling stuff + // below does some of that - but only if we've looked high enough up the DOM + // tree. + parentOfTopCandidate = topCandidate.parentNode; + var lastScore = topCandidate.readability.contentScore; + // The scores shouldn't get too low. + var scoreThreshold = lastScore / 3; + while (parentOfTopCandidate.tagName !== "BODY") { + if (!parentOfTopCandidate.readability) { + parentOfTopCandidate = parentOfTopCandidate.parentNode; + continue; + } + var parentScore = parentOfTopCandidate.readability.contentScore; + if (parentScore < scoreThreshold) + break; + if (parentScore > lastScore) { + // Alright! We found a better parent to use. + topCandidate = parentOfTopCandidate; + break; + } + lastScore = parentOfTopCandidate.readability.contentScore; + parentOfTopCandidate = parentOfTopCandidate.parentNode; + } + + // If the top candidate is the only child, use parent instead. This will help sibling + // joining logic when adjacent content is actually located in parent's sibling node. + parentOfTopCandidate = topCandidate.parentNode; + while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) { + topCandidate = parentOfTopCandidate; + parentOfTopCandidate = topCandidate.parentNode; + } + if (!topCandidate.readability) { + this._initializeNode(topCandidate); + } + } + + // Now that we have the top candidate, look through its siblings for content + // that might also be related. Things like preambles, content split by ads + // that we removed, etc. + var articleContent = doc.createElement("DIV"); + if (isPaging) + articleContent.id = "readability-content"; + + var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); + // Keep potential top candidate's parent node to try to get text direction of it later. + parentOfTopCandidate = topCandidate.parentNode; + var siblings = parentOfTopCandidate.children; + + for (var s = 0, sl = siblings.length; s < sl; s++) { + var sibling = siblings[s]; + var append = false; + + this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ""); + this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown"); + + if (sibling === topCandidate) { + append = true; + } else { + var contentBonus = 0; + + // Give a bonus if sibling nodes and top candidates have the example same classname + if (sibling.className === topCandidate.className && topCandidate.className !== "") + contentBonus += topCandidate.readability.contentScore * 0.2; + + if (sibling.readability && + ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) { + append = true; + } else if (sibling.nodeName === "P") { + var linkDensity = this._getLinkDensity(sibling); + var nodeContent = this._getInnerText(sibling); + var nodeLength = nodeContent.length; + + if (nodeLength > 80 && linkDensity < 0.25) { + append = true; + } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 && + nodeContent.search(/\.( |$)/) !== -1) { + append = true; + } + } + } + + if (append) { + this.log("Appending node:", sibling); + + if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) { + // We have a node that isn't a common block level element, like a form or td tag. + // Turn it into a div so it doesn't get filtered out later by accident. + this.log("Altering sibling:", sibling, "to div."); + + sibling = this._setNodeTag(sibling, "DIV"); + } + + articleContent.appendChild(sibling); + // Fetch children again to make it compatible + // with DOM parsers without live collection support. + siblings = parentOfTopCandidate.children; + // siblings is a reference to the children array, and + // sibling is removed from the array when we call appendChild(). + // As a result, we must revisit this index since the nodes + // have been shifted. + s -= 1; + sl -= 1; + } + } + + if (this._debug) + this.log("Article content pre-prep: " + articleContent.innerHTML); + // So we have all of the content that we need. Now we clean it up for presentation. + this._prepArticle(articleContent); + if (this._debug) + this.log("Article content post-prep: " + articleContent.innerHTML); + + if (neededToCreateTopCandidate) { + // We already created a fake div thing, and there wouldn't have been any siblings left + // for the previous loop, so there's no point trying to create a new div, and then + // move all the children over. Just assign IDs and class names here. No need to append + // because that already happened anyway. + topCandidate.id = "readability-page-1"; + topCandidate.className = "page"; + } else { + var div = doc.createElement("DIV"); + div.id = "readability-page-1"; + div.className = "page"; + while (articleContent.firstChild) { + div.appendChild(articleContent.firstChild); + } + articleContent.appendChild(div); + } + + if (this._debug) + this.log("Article content after paging: " + articleContent.innerHTML); + + var parseSuccessful = true; + + // Now that we've gone through the full algorithm, check to see if + // we got any meaningful content. If we didn't, we may need to re-run + // grabArticle with different flags set. This gives us a higher likelihood of + // finding the content, and the sieve approach gives us a higher likelihood of + // finding the -right- content. + var textLength = this._getInnerText(articleContent, true).length; + if (textLength < this._charThreshold) { + parseSuccessful = false; + page.innerHTML = pageCacheHtml; + + if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { + this._removeFlag(this.FLAG_STRIP_UNLIKELYS); + this._attempts.push({articleContent: articleContent, textLength: textLength}); + } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { + this._removeFlag(this.FLAG_WEIGHT_CLASSES); + this._attempts.push({articleContent: articleContent, textLength: textLength}); + } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { + this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); + this._attempts.push({articleContent: articleContent, textLength: textLength}); + } else { + this._attempts.push({articleContent: articleContent, textLength: textLength}); + // No luck after removing flags, just return the longest text we found during the different loops + this._attempts.sort(function (a, b) { + return b.textLength - a.textLength; + }); + + // But first check if we actually have something + if (!this._attempts[0].textLength) { + return null; + } + + articleContent = this._attempts[0].articleContent; + parseSuccessful = true; + } + } + + if (parseSuccessful) { + // Find out text direction from ancestors of final top candidate. + var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate)); + this._someNode(ancestors, function(ancestor) { + if (!ancestor.tagName) + return false; + var articleDir = ancestor.getAttribute("dir"); + if (articleDir) { + this._articleDir = articleDir; + return true; + } + return false; + }); + return articleContent; + } + } + }, + + /** + * Check whether the input string could be a byline. + * This verifies that the input is a string, and that the length + * is less than 100 chars. + * + * @param possibleByline {string} - a string to check whether its a byline. + * @return Boolean - whether the input string is a byline. + */ + _isValidByline: function(byline) { + if (typeof byline == "string" || byline instanceof String) { + byline = byline.trim(); + return (byline.length > 0) && (byline.length < 100); + } + return false; + }, + + /** + * Converts some of the common HTML entities in string to their corresponding characters. + * + * @param str {string} - a string to unescape. + * @return string without HTML entity. + */ + _unescapeHtmlEntities: function(str) { + if (!str) { + return str; + } + + var htmlEscapeMap = this.HTML_ESCAPE_MAP; + return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) { + return htmlEscapeMap[tag]; + }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) { + var num = parseInt(hex || numStr, hex ? 16 : 10); + return String.fromCharCode(num); + }); + }, + + /** + * Try to extract metadata from JSON-LD object. + * For now, only Schema.org objects of type Article or its subtypes are supported. + * @return Object with any metadata that could be extracted (possibly none) + */ + _getJSONLD: function (doc) { + var scripts = this._getAllNodesWithTag(doc, ["script"]); + + var metadata; + + this._forEachNode(scripts, function(jsonLdElement) { + if (!metadata && jsonLdElement.getAttribute("type") === "application/ld+json") { + try { + // Strip CDATA markers if present + var content = jsonLdElement.textContent.replace(/^\s*\s*$/g, ""); + var parsed = JSON.parse(content); + if ( + !parsed["@context"] || + !parsed["@context"].match(/^https?\:\/\/schema\.org$/) + ) { + return; + } + + if (!parsed["@type"] && Array.isArray(parsed["@graph"])) { + parsed = parsed["@graph"].find(function(it) { + return (it["@type"] || "").match( + this.REGEXPS.jsonLdArticleTypes + ); + }); + } + + if ( + !parsed || + !parsed["@type"] || + !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes) + ) { + return; + } + + metadata = {}; + + if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) { + // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz + // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either + // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default. + + var title = this._getArticleTitle(); + var nameMatches = this._textSimilarity(parsed.name, title) > 0.75; + var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75; + + if (headlineMatches && !nameMatches) { + metadata.title = parsed.headline; + } else { + metadata.title = parsed.name; + } + } else if (typeof parsed.name === "string") { + metadata.title = parsed.name.trim(); + } else if (typeof parsed.headline === "string") { + metadata.title = parsed.headline.trim(); + } + if (parsed.author) { + if (typeof parsed.author.name === "string") { + metadata.byline = parsed.author.name.trim(); + } else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") { + metadata.byline = parsed.author + .filter(function(author) { + return author && typeof author.name === "string"; + }) + .map(function(author) { + return author.name.trim(); + }) + .join(", "); + } + } + if (typeof parsed.description === "string") { + metadata.excerpt = parsed.description.trim(); + } + if ( + parsed.publisher && + typeof parsed.publisher.name === "string" + ) { + metadata.siteName = parsed.publisher.name.trim(); + } + return; + } catch (err) { + this.log(err.message); + } + } + }); + return metadata ? metadata : {}; + }, + + /** + * Attempts to get excerpt and byline metadata for the article. + * + * @param {Object} jsonld — object containing any metadata that + * could be extracted from JSON-LD object. + * + * @return Object with optional "excerpt" and "byline" properties + */ + _getArticleMetadata: function(jsonld) { + var metadata = {}; + var values = {}; + var metaElements = this._doc.getElementsByTagName("meta"); + + // property is a space-separated list of values + var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*/gi; + + // name is a single value + var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i; + + // Find description tags. + this._forEachNode(metaElements, function(element) { + var elementName = element.getAttribute("name"); + var elementProperty = element.getAttribute("property"); + var content = element.getAttribute("content"); + if (!content) { + return; + } + var matches = null; + var name = null; + + if (elementProperty) { + matches = elementProperty.match(propertyPattern); + if (matches) { + // Convert to lowercase, and remove any whitespace + // so we can match below. + name = matches[0].toLowerCase().replace(/\s/g, ""); + // multiple authors + values[name] = content.trim(); + } + } + if (!matches && elementName && namePattern.test(elementName)) { + name = elementName; + if (content) { + // Convert to lowercase, remove any whitespace, and convert dots + // to colons so we can match below. + name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":"); + values[name] = content.trim(); + } + } + }); + + // get title + metadata.title = jsonld.title || + values["dc:title"] || + values["dcterm:title"] || + values["og:title"] || + values["weibo:article:title"] || + values["weibo:webpage:title"] || + values["title"] || + values["twitter:title"]; + + if (!metadata.title) { + metadata.title = this._getArticleTitle(); + } + + // get author + metadata.byline = jsonld.byline || + values["dc:creator"] || + values["dcterm:creator"] || + values["author"]; + + // get description + metadata.excerpt = jsonld.excerpt || + values["dc:description"] || + values["dcterm:description"] || + values["og:description"] || + values["weibo:article:description"] || + values["weibo:webpage:description"] || + values["description"] || + values["twitter:description"]; + + // get site name + metadata.siteName = jsonld.siteName || + values["og:site_name"]; + + // in many sites the meta value is escaped with HTML entities, + // so here we need to unescape it + metadata.title = this._unescapeHtmlEntities(metadata.title); + metadata.byline = this._unescapeHtmlEntities(metadata.byline); + metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt); + metadata.siteName = this._unescapeHtmlEntities(metadata.siteName); + + return metadata; + }, + + /** + * Check if node is image, or if node contains exactly only one image + * whether as a direct child or as its descendants. + * + * @param Element + **/ + _isSingleImage: function(node) { + if (node.tagName === "IMG") { + return true; + } + + if (node.children.length !== 1 || node.textContent.trim() !== "") { + return false; + } + + return this._isSingleImage(node.children[0]); + }, + + /** + * Find all