summaryrefslogtreecommitdiffstats
path: root/toolkit/components/reader/readability/JSDOMParser.js
diff options
context:
space:
mode:
Diffstat (limited to 'toolkit/components/reader/readability/JSDOMParser.js')
-rw-r--r--toolkit/components/reader/readability/JSDOMParser.js1197
1 files changed, 1197 insertions, 0 deletions
diff --git a/toolkit/components/reader/readability/JSDOMParser.js b/toolkit/components/reader/readability/JSDOMParser.js
new file mode 100644
index 0000000000..7b6d241b43
--- /dev/null
+++ b/toolkit/components/reader/readability/JSDOMParser.js
@@ -0,0 +1,1197 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * This is a relatively lightweight DOMParser that is safe to use in a web
+ * worker. This is far from a complete DOM implementation; however, it should
+ * contain the minimal set of functionality necessary for Readability.js.
+ *
+ * Aside from not implementing the full DOM API, there are other quirks to be
+ * aware of when using the JSDOMParser:
+ *
+ * 1) Properly formed HTML/XML must be used. This means you should be extra
+ * careful when using this parser on anything received directly from an
+ * XMLHttpRequest. Providing a serialized string from an XMLSerializer,
+ * however, should be safe (since the browser's XMLSerializer should
+ * generate valid HTML/XML). Therefore, if parsing a document from an XHR,
+ * the recommended approach is to do the XHR in the main thread, use
+ * XMLSerializer.serializeToString() on the responseXML, and pass the
+ * resulting string to the worker.
+ *
+ * 2) Live NodeLists are not supported. DOM methods and properties such as
+ * getElementsByTagName() and childNodes return standard arrays. If you
+ * want these lists to be updated when nodes are removed or added to the
+ * document, you must take care to manually update them yourself.
+ */
+(function (global) {
+
+ // XML only defines these and the numeric ones:
+
+ var entityTable = {
+ "lt": "<",
+ "gt": ">",
+ "amp": "&",
+ "quot": '"',
+ "apos": "'",
+ };
+
+ var reverseEntityTable = {
+ "<": "&lt;",
+ ">": "&gt;",
+ "&": "&amp;",
+ '"': "&quot;",
+ "'": "&apos;",
+ };
+
+ function encodeTextContentHTML(s) {
+ return s.replace(/[&<>]/g, function(x) {
+ return reverseEntityTable[x];
+ });
+ }
+
+ function encodeHTML(s) {
+ return s.replace(/[&<>'"]/g, function(x) {
+ return reverseEntityTable[x];
+ });
+ }
+
+ function decodeHTML(str) {
+ return str.replace(/&(quot|amp|apos|lt|gt);/g, function(match, tag) {
+ return entityTable[tag];
+ }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(match, hex, numStr) {
+ var num = parseInt(hex || numStr, hex ? 16 : 10); // read num
+ return String.fromCharCode(num);
+ });
+ }
+
+ // When a style is set in JS, map it to the corresponding CSS attribute
+ var styleMap = {
+ "alignmentBaseline": "alignment-baseline",
+ "background": "background",
+ "backgroundAttachment": "background-attachment",
+ "backgroundClip": "background-clip",
+ "backgroundColor": "background-color",
+ "backgroundImage": "background-image",
+ "backgroundOrigin": "background-origin",
+ "backgroundPosition": "background-position",
+ "backgroundPositionX": "background-position-x",
+ "backgroundPositionY": "background-position-y",
+ "backgroundRepeat": "background-repeat",
+ "backgroundRepeatX": "background-repeat-x",
+ "backgroundRepeatY": "background-repeat-y",
+ "backgroundSize": "background-size",
+ "baselineShift": "baseline-shift",
+ "border": "border",
+ "borderBottom": "border-bottom",
+ "borderBottomColor": "border-bottom-color",
+ "borderBottomLeftRadius": "border-bottom-left-radius",
+ "borderBottomRightRadius": "border-bottom-right-radius",
+ "borderBottomStyle": "border-bottom-style",
+ "borderBottomWidth": "border-bottom-width",
+ "borderCollapse": "border-collapse",
+ "borderColor": "border-color",
+ "borderImage": "border-image",
+ "borderImageOutset": "border-image-outset",
+ "borderImageRepeat": "border-image-repeat",
+ "borderImageSlice": "border-image-slice",
+ "borderImageSource": "border-image-source",
+ "borderImageWidth": "border-image-width",
+ "borderLeft": "border-left",
+ "borderLeftColor": "border-left-color",
+ "borderLeftStyle": "border-left-style",
+ "borderLeftWidth": "border-left-width",
+ "borderRadius": "border-radius",
+ "borderRight": "border-right",
+ "borderRightColor": "border-right-color",
+ "borderRightStyle": "border-right-style",
+ "borderRightWidth": "border-right-width",
+ "borderSpacing": "border-spacing",
+ "borderStyle": "border-style",
+ "borderTop": "border-top",
+ "borderTopColor": "border-top-color",
+ "borderTopLeftRadius": "border-top-left-radius",
+ "borderTopRightRadius": "border-top-right-radius",
+ "borderTopStyle": "border-top-style",
+ "borderTopWidth": "border-top-width",
+ "borderWidth": "border-width",
+ "bottom": "bottom",
+ "boxShadow": "box-shadow",
+ "boxSizing": "box-sizing",
+ "captionSide": "caption-side",
+ "clear": "clear",
+ "clip": "clip",
+ "clipPath": "clip-path",
+ "clipRule": "clip-rule",
+ "color": "color",
+ "colorInterpolation": "color-interpolation",
+ "colorInterpolationFilters": "color-interpolation-filters",
+ "colorProfile": "color-profile",
+ "colorRendering": "color-rendering",
+ "content": "content",
+ "counterIncrement": "counter-increment",
+ "counterReset": "counter-reset",
+ "cursor": "cursor",
+ "direction": "direction",
+ "display": "display",
+ "dominantBaseline": "dominant-baseline",
+ "emptyCells": "empty-cells",
+ "enableBackground": "enable-background",
+ "fill": "fill",
+ "fillOpacity": "fill-opacity",
+ "fillRule": "fill-rule",
+ "filter": "filter",
+ "cssFloat": "float",
+ "floodColor": "flood-color",
+ "floodOpacity": "flood-opacity",
+ "font": "font",
+ "fontFamily": "font-family",
+ "fontSize": "font-size",
+ "fontStretch": "font-stretch",
+ "fontStyle": "font-style",
+ "fontVariant": "font-variant",
+ "fontWeight": "font-weight",
+ "glyphOrientationHorizontal": "glyph-orientation-horizontal",
+ "glyphOrientationVertical": "glyph-orientation-vertical",
+ "height": "height",
+ "imageRendering": "image-rendering",
+ "kerning": "kerning",
+ "left": "left",
+ "letterSpacing": "letter-spacing",
+ "lightingColor": "lighting-color",
+ "lineHeight": "line-height",
+ "listStyle": "list-style",
+ "listStyleImage": "list-style-image",
+ "listStylePosition": "list-style-position",
+ "listStyleType": "list-style-type",
+ "margin": "margin",
+ "marginBottom": "margin-bottom",
+ "marginLeft": "margin-left",
+ "marginRight": "margin-right",
+ "marginTop": "margin-top",
+ "marker": "marker",
+ "markerEnd": "marker-end",
+ "markerMid": "marker-mid",
+ "markerStart": "marker-start",
+ "mask": "mask",
+ "maxHeight": "max-height",
+ "maxWidth": "max-width",
+ "minHeight": "min-height",
+ "minWidth": "min-width",
+ "opacity": "opacity",
+ "orphans": "orphans",
+ "outline": "outline",
+ "outlineColor": "outline-color",
+ "outlineOffset": "outline-offset",
+ "outlineStyle": "outline-style",
+ "outlineWidth": "outline-width",
+ "overflow": "overflow",
+ "overflowX": "overflow-x",
+ "overflowY": "overflow-y",
+ "padding": "padding",
+ "paddingBottom": "padding-bottom",
+ "paddingLeft": "padding-left",
+ "paddingRight": "padding-right",
+ "paddingTop": "padding-top",
+ "page": "page",
+ "pageBreakAfter": "page-break-after",
+ "pageBreakBefore": "page-break-before",
+ "pageBreakInside": "page-break-inside",
+ "pointerEvents": "pointer-events",
+ "position": "position",
+ "quotes": "quotes",
+ "resize": "resize",
+ "right": "right",
+ "shapeRendering": "shape-rendering",
+ "size": "size",
+ "speak": "speak",
+ "src": "src",
+ "stopColor": "stop-color",
+ "stopOpacity": "stop-opacity",
+ "stroke": "stroke",
+ "strokeDasharray": "stroke-dasharray",
+ "strokeDashoffset": "stroke-dashoffset",
+ "strokeLinecap": "stroke-linecap",
+ "strokeLinejoin": "stroke-linejoin",
+ "strokeMiterlimit": "stroke-miterlimit",
+ "strokeOpacity": "stroke-opacity",
+ "strokeWidth": "stroke-width",
+ "tableLayout": "table-layout",
+ "textAlign": "text-align",
+ "textAnchor": "text-anchor",
+ "textDecoration": "text-decoration",
+ "textIndent": "text-indent",
+ "textLineThrough": "text-line-through",
+ "textLineThroughColor": "text-line-through-color",
+ "textLineThroughMode": "text-line-through-mode",
+ "textLineThroughStyle": "text-line-through-style",
+ "textLineThroughWidth": "text-line-through-width",
+ "textOverflow": "text-overflow",
+ "textOverline": "text-overline",
+ "textOverlineColor": "text-overline-color",
+ "textOverlineMode": "text-overline-mode",
+ "textOverlineStyle": "text-overline-style",
+ "textOverlineWidth": "text-overline-width",
+ "textRendering": "text-rendering",
+ "textShadow": "text-shadow",
+ "textTransform": "text-transform",
+ "textUnderline": "text-underline",
+ "textUnderlineColor": "text-underline-color",
+ "textUnderlineMode": "text-underline-mode",
+ "textUnderlineStyle": "text-underline-style",
+ "textUnderlineWidth": "text-underline-width",
+ "top": "top",
+ "unicodeBidi": "unicode-bidi",
+ "unicodeRange": "unicode-range",
+ "vectorEffect": "vector-effect",
+ "verticalAlign": "vertical-align",
+ "visibility": "visibility",
+ "whiteSpace": "white-space",
+ "widows": "widows",
+ "width": "width",
+ "wordBreak": "word-break",
+ "wordSpacing": "word-spacing",
+ "wordWrap": "word-wrap",
+ "writingMode": "writing-mode",
+ "zIndex": "z-index",
+ "zoom": "zoom",
+ };
+
+ // Elements that can be self-closing
+ var voidElems = {
+ "area": true,
+ "base": true,
+ "br": true,
+ "col": true,
+ "command": true,
+ "embed": true,
+ "hr": true,
+ "img": true,
+ "input": true,
+ "link": true,
+ "meta": true,
+ "param": true,
+ "source": true,
+ "wbr": true
+ };
+
+ var whitespace = [" ", "\t", "\n", "\r"];
+
+ // See https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
+ var nodeTypes = {
+ ELEMENT_NODE: 1,
+ ATTRIBUTE_NODE: 2,
+ TEXT_NODE: 3,
+ CDATA_SECTION_NODE: 4,
+ ENTITY_REFERENCE_NODE: 5,
+ ENTITY_NODE: 6,
+ PROCESSING_INSTRUCTION_NODE: 7,
+ COMMENT_NODE: 8,
+ DOCUMENT_NODE: 9,
+ DOCUMENT_TYPE_NODE: 10,
+ DOCUMENT_FRAGMENT_NODE: 11,
+ NOTATION_NODE: 12
+ };
+
+ function getElementsByTagName(tag) {
+ tag = tag.toUpperCase();
+ var elems = [];
+ var allTags = (tag === "*");
+ function getElems(node) {
+ var length = node.children.length;
+ for (var i = 0; i < length; i++) {
+ var child = node.children[i];
+ if (allTags || (child.tagName === tag))
+ elems.push(child);
+ getElems(child);
+ }
+ }
+ getElems(this);
+ elems._isLiveNodeList = true;
+ return elems;
+ }
+
+ var Node = function () {};
+
+ Node.prototype = {
+ attributes: null,
+ childNodes: null,
+ localName: null,
+ nodeName: null,
+ parentNode: null,
+ textContent: null,
+ nextSibling: null,
+ previousSibling: null,
+
+ get firstChild() {
+ return this.childNodes[0] || null;
+ },
+
+ get firstElementChild() {
+ return this.children[0] || null;
+ },
+
+ get lastChild() {
+ return this.childNodes[this.childNodes.length - 1] || null;
+ },
+
+ get lastElementChild() {
+ return this.children[this.children.length - 1] || null;
+ },
+
+ appendChild: function (child) {
+ if (child.parentNode) {
+ child.parentNode.removeChild(child);
+ }
+
+ var last = this.lastChild;
+ if (last)
+ last.nextSibling = child;
+ child.previousSibling = last;
+
+ if (child.nodeType === Node.ELEMENT_NODE) {
+ child.previousElementSibling = this.children[this.children.length - 1] || null;
+ this.children.push(child);
+ child.previousElementSibling && (child.previousElementSibling.nextElementSibling = child);
+ }
+ this.childNodes.push(child);
+ child.parentNode = this;
+ },
+
+ removeChild: function (child) {
+ var childNodes = this.childNodes;
+ var childIndex = childNodes.indexOf(child);
+ if (childIndex === -1) {
+ throw "removeChild: node not found";
+ } else {
+ child.parentNode = null;
+ var prev = child.previousSibling;
+ var next = child.nextSibling;
+ if (prev)
+ prev.nextSibling = next;
+ if (next)
+ next.previousSibling = prev;
+
+ if (child.nodeType === Node.ELEMENT_NODE) {
+ prev = child.previousElementSibling;
+ next = child.nextElementSibling;
+ if (prev)
+ prev.nextElementSibling = next;
+ if (next)
+ next.previousElementSibling = prev;
+ this.children.splice(this.children.indexOf(child), 1);
+ }
+
+ child.previousSibling = child.nextSibling = null;
+ child.previousElementSibling = child.nextElementSibling = null;
+
+ return childNodes.splice(childIndex, 1)[0];
+ }
+ },
+
+ replaceChild: function (newNode, oldNode) {
+ var childNodes = this.childNodes;
+ var childIndex = childNodes.indexOf(oldNode);
+ if (childIndex === -1) {
+ throw "replaceChild: node not found";
+ } else {
+ // This will take care of updating the new node if it was somewhere else before:
+ if (newNode.parentNode)
+ newNode.parentNode.removeChild(newNode);
+
+ childNodes[childIndex] = newNode;
+
+ // update the new node's sibling properties, and its new siblings' sibling properties
+ newNode.nextSibling = oldNode.nextSibling;
+ newNode.previousSibling = oldNode.previousSibling;
+ if (newNode.nextSibling)
+ newNode.nextSibling.previousSibling = newNode;
+ if (newNode.previousSibling)
+ newNode.previousSibling.nextSibling = newNode;
+
+ newNode.parentNode = this;
+
+ // Now deal with elements before we clear out those values for the old node,
+ // because it can help us take shortcuts here:
+ if (newNode.nodeType === Node.ELEMENT_NODE) {
+ if (oldNode.nodeType === Node.ELEMENT_NODE) {
+ // Both were elements, which makes this easier, we just swap things out:
+ newNode.previousElementSibling = oldNode.previousElementSibling;
+ newNode.nextElementSibling = oldNode.nextElementSibling;
+ if (newNode.previousElementSibling)
+ newNode.previousElementSibling.nextElementSibling = newNode;
+ if (newNode.nextElementSibling)
+ newNode.nextElementSibling.previousElementSibling = newNode;
+ this.children[this.children.indexOf(oldNode)] = newNode;
+ } else {
+ // Hard way:
+ newNode.previousElementSibling = (function() {
+ for (var i = childIndex - 1; i >= 0; i--) {
+ if (childNodes[i].nodeType === Node.ELEMENT_NODE)
+ return childNodes[i];
+ }
+ return null;
+ })();
+ if (newNode.previousElementSibling) {
+ newNode.nextElementSibling = newNode.previousElementSibling.nextElementSibling;
+ } else {
+ newNode.nextElementSibling = (function() {
+ for (var i = childIndex + 1; i < childNodes.length; i++) {
+ if (childNodes[i].nodeType === Node.ELEMENT_NODE)
+ return childNodes[i];
+ }
+ return null;
+ })();
+ }
+ if (newNode.previousElementSibling)
+ newNode.previousElementSibling.nextElementSibling = newNode;
+ if (newNode.nextElementSibling)
+ newNode.nextElementSibling.previousElementSibling = newNode;
+
+ if (newNode.nextElementSibling)
+ this.children.splice(this.children.indexOf(newNode.nextElementSibling), 0, newNode);
+ else
+ this.children.push(newNode);
+ }
+ } else if (oldNode.nodeType === Node.ELEMENT_NODE) {
+ // new node is not an element node.
+ // if the old one was, update its element siblings:
+ if (oldNode.previousElementSibling)
+ oldNode.previousElementSibling.nextElementSibling = oldNode.nextElementSibling;
+ if (oldNode.nextElementSibling)
+ oldNode.nextElementSibling.previousElementSibling = oldNode.previousElementSibling;
+ this.children.splice(this.children.indexOf(oldNode), 1);
+
+ // If the old node wasn't an element, neither the new nor the old node was an element,
+ // and the children array and its members shouldn't need any updating.
+ }
+
+
+ oldNode.parentNode = null;
+ oldNode.previousSibling = null;
+ oldNode.nextSibling = null;
+ if (oldNode.nodeType === Node.ELEMENT_NODE) {
+ oldNode.previousElementSibling = null;
+ oldNode.nextElementSibling = null;
+ }
+ return oldNode;
+ }
+ },
+
+ __JSDOMParser__: true,
+ };
+
+ for (var nodeType in nodeTypes) {
+ Node[nodeType] = Node.prototype[nodeType] = nodeTypes[nodeType];
+ }
+
+ var Attribute = function (name, value) {
+ this.name = name;
+ this._value = value;
+ };
+
+ Attribute.prototype = {
+ get value() {
+ return this._value;
+ },
+ setValue: function(newValue) {
+ this._value = newValue;
+ },
+ getEncodedValue: function() {
+ return encodeHTML(this._value);
+ },
+ };
+
+ var Comment = function () {
+ this.childNodes = [];
+ };
+
+ Comment.prototype = {
+ __proto__: Node.prototype,
+
+ nodeName: "#comment",
+ nodeType: Node.COMMENT_NODE
+ };
+
+ var Text = function () {
+ this.childNodes = [];
+ };
+
+ Text.prototype = {
+ __proto__: Node.prototype,
+
+ nodeName: "#text",
+ nodeType: Node.TEXT_NODE,
+ get textContent() {
+ if (typeof this._textContent === "undefined") {
+ this._textContent = decodeHTML(this._innerHTML || "");
+ }
+ return this._textContent;
+ },
+ get innerHTML() {
+ if (typeof this._innerHTML === "undefined") {
+ this._innerHTML = encodeTextContentHTML(this._textContent || "");
+ }
+ return this._innerHTML;
+ },
+
+ set innerHTML(newHTML) {
+ this._innerHTML = newHTML;
+ delete this._textContent;
+ },
+ set textContent(newText) {
+ this._textContent = newText;
+ delete this._innerHTML;
+ },
+ };
+
+ var Document = function (url) {
+ this.documentURI = url;
+ this.styleSheets = [];
+ this.childNodes = [];
+ this.children = [];
+ };
+
+ Document.prototype = {
+ __proto__: Node.prototype,
+
+ nodeName: "#document",
+ nodeType: Node.DOCUMENT_NODE,
+ title: "",
+
+ getElementsByTagName: getElementsByTagName,
+
+ getElementById: function (id) {
+ function getElem(node) {
+ var length = node.children.length;
+ if (node.id === id)
+ return node;
+ for (var i = 0; i < length; i++) {
+ var el = getElem(node.children[i]);
+ if (el)
+ return el;
+ }
+ return null;
+ }
+ return getElem(this);
+ },
+
+ createElement: function (tag) {
+ var node = new Element(tag);
+ return node;
+ },
+
+ createTextNode: function (text) {
+ var node = new Text();
+ node.textContent = text;
+ return node;
+ },
+
+ get baseURI() {
+ if (!this.hasOwnProperty("_baseURI")) {
+ this._baseURI = this.documentURI;
+ var baseElements = this.getElementsByTagName("base");
+ var href = baseElements[0] && baseElements[0].getAttribute("href");
+ if (href) {
+ try {
+ this._baseURI = (new URL(href, this._baseURI)).href;
+ } catch (ex) {/* Just fall back to documentURI */}
+ }
+ }
+ return this._baseURI;
+ },
+ };
+
+ var Element = function (tag) {
+ // We use this to find the closing tag.
+ this._matchingTag = tag;
+ // We're explicitly a non-namespace aware parser, we just pretend it's all HTML.
+ var lastColonIndex = tag.lastIndexOf(":");
+ if (lastColonIndex != -1) {
+ tag = tag.substring(lastColonIndex + 1);
+ }
+ this.attributes = [];
+ this.childNodes = [];
+ this.children = [];
+ this.nextElementSibling = this.previousElementSibling = null;
+ this.localName = tag.toLowerCase();
+ this.tagName = tag.toUpperCase();
+ this.style = new Style(this);
+ };
+
+ Element.prototype = {
+ __proto__: Node.prototype,
+
+ nodeType: Node.ELEMENT_NODE,
+
+ getElementsByTagName: getElementsByTagName,
+
+ get className() {
+ return this.getAttribute("class") || "";
+ },
+
+ set className(str) {
+ this.setAttribute("class", str);
+ },
+
+ get id() {
+ return this.getAttribute("id") || "";
+ },
+
+ set id(str) {
+ this.setAttribute("id", str);
+ },
+
+ get href() {
+ return this.getAttribute("href") || "";
+ },
+
+ set href(str) {
+ this.setAttribute("href", str);
+ },
+
+ get src() {
+ return this.getAttribute("src") || "";
+ },
+
+ set src(str) {
+ this.setAttribute("src", str);
+ },
+
+ get srcset() {
+ return this.getAttribute("srcset") || "";
+ },
+
+ set srcset(str) {
+ this.setAttribute("srcset", str);
+ },
+
+ get nodeName() {
+ return this.tagName;
+ },
+
+ get innerHTML() {
+ function getHTML(node) {
+ var i = 0;
+ for (i = 0; i < node.childNodes.length; i++) {
+ var child = node.childNodes[i];
+ if (child.localName) {
+ arr.push("<" + child.localName);
+
+ // serialize attribute list
+ for (var j = 0; j < child.attributes.length; j++) {
+ var attr = child.attributes[j];
+ // the attribute value will be HTML escaped.
+ var val = attr.getEncodedValue();
+ var quote = (val.indexOf('"') === -1 ? '"' : "'");
+ arr.push(" " + attr.name + "=" + quote + val + quote);
+ }
+
+ if (child.localName in voidElems && !child.childNodes.length) {
+ // if this is a self-closing element, end it here
+ arr.push("/>");
+ } else {
+ // otherwise, add its children
+ arr.push(">");
+ getHTML(child);
+ arr.push("</" + child.localName + ">");
+ }
+ } else {
+ // This is a text node, so asking for innerHTML won't recurse.
+ arr.push(child.innerHTML);
+ }
+ }
+ }
+
+ // Using Array.join() avoids the overhead from lazy string concatenation.
+ var arr = [];
+ getHTML(this);
+ return arr.join("");
+ },
+
+ set innerHTML(html) {
+ var parser = new JSDOMParser();
+ var node = parser.parse(html);
+ var i;
+ for (i = this.childNodes.length; --i >= 0;) {
+ this.childNodes[i].parentNode = null;
+ }
+ this.childNodes = node.childNodes;
+ this.children = node.children;
+ for (i = this.childNodes.length; --i >= 0;) {
+ this.childNodes[i].parentNode = this;
+ }
+ },
+
+ set textContent(text) {
+ // clear parentNodes for existing children
+ for (var i = this.childNodes.length; --i >= 0;) {
+ this.childNodes[i].parentNode = null;
+ }
+
+ var node = new Text();
+ this.childNodes = [ node ];
+ this.children = [];
+ node.textContent = text;
+ node.parentNode = this;
+ },
+
+ get textContent() {
+ function getText(node) {
+ var nodes = node.childNodes;
+ for (var i = 0; i < nodes.length; i++) {
+ var child = nodes[i];
+ if (child.nodeType === 3) {
+ text.push(child.textContent);
+ } else {
+ getText(child);
+ }
+ }
+ }
+
+ // Using Array.join() avoids the overhead from lazy string concatenation.
+ // See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes
+ var text = [];
+ getText(this);
+ return text.join("");
+ },
+
+ getAttribute: function (name) {
+ for (var i = this.attributes.length; --i >= 0;) {
+ var attr = this.attributes[i];
+ if (attr.name === name) {
+ return attr.value;
+ }
+ }
+ return undefined;
+ },
+
+ setAttribute: function (name, value) {
+ for (var i = this.attributes.length; --i >= 0;) {
+ var attr = this.attributes[i];
+ if (attr.name === name) {
+ attr.setValue(value);
+ return;
+ }
+ }
+ this.attributes.push(new Attribute(name, value));
+ },
+
+ removeAttribute: function (name) {
+ for (var i = this.attributes.length; --i >= 0;) {
+ var attr = this.attributes[i];
+ if (attr.name === name) {
+ this.attributes.splice(i, 1);
+ break;
+ }
+ }
+ },
+
+ hasAttribute: function (name) {
+ return this.attributes.some(function (attr) {
+ return attr.name == name;
+ });
+ },
+ };
+
+ var Style = function (node) {
+ this.node = node;
+ };
+
+ // getStyle() and setStyle() use the style attribute string directly. This
+ // won't be very efficient if there are a lot of style manipulations, but
+ // it's the easiest way to make sure the style attribute string and the JS
+ // style property stay in sync. Readability.js doesn't do many style
+ // manipulations, so this should be okay.
+ Style.prototype = {
+ getStyle: function (styleName) {
+ var attr = this.node.getAttribute("style");
+ if (!attr)
+ return undefined;
+
+ var styles = attr.split(";");
+ for (var i = 0; i < styles.length; i++) {
+ var style = styles[i].split(":");
+ var name = style[0].trim();
+ if (name === styleName)
+ return style[1].trim();
+ }
+
+ return undefined;
+ },
+
+ setStyle: function (styleName, styleValue) {
+ var value = this.node.getAttribute("style") || "";
+ var index = 0;
+ do {
+ var next = value.indexOf(";", index) + 1;
+ var length = next - index - 1;
+ var style = (length > 0 ? value.substr(index, length) : value.substr(index));
+ if (style.substr(0, style.indexOf(":")).trim() === styleName) {
+ value = value.substr(0, index).trim() + (next ? " " + value.substr(next).trim() : "");
+ break;
+ }
+ index = next;
+ } while (index);
+
+ value += " " + styleName + ": " + styleValue + ";";
+ this.node.setAttribute("style", value.trim());
+ }
+ };
+
+ // For each item in styleMap, define a getter and setter on the style
+ // property.
+ for (var jsName in styleMap) {
+ (function (cssName) {
+ Style.prototype.__defineGetter__(jsName, function () {
+ return this.getStyle(cssName);
+ });
+ Style.prototype.__defineSetter__(jsName, function (value) {
+ this.setStyle(cssName, value);
+ });
+ })(styleMap[jsName]);
+ }
+
+ var JSDOMParser = function () {
+ this.currentChar = 0;
+
+ // In makeElementNode() we build up many strings one char at a time. Using
+ // += for this results in lots of short-lived intermediate strings. It's
+ // better to build an array of single-char strings and then join() them
+ // together at the end. And reusing a single array (i.e. |this.strBuf|)
+ // over and over for this purpose uses less memory than using a new array
+ // for each string.
+ this.strBuf = [];
+
+ // Similarly, we reuse this array to return the two arguments from
+ // makeElementNode(), which saves us from having to allocate a new array
+ // every time.
+ this.retPair = [];
+
+ this.errorState = "";
+ };
+
+ JSDOMParser.prototype = {
+ error: function(m) {
+ if (typeof console !== "undefined") {
+ console.log("JSDOMParser error: " + m + "\n");
+ } else if (typeof dump !== "undefined") {
+ /* global dump */
+ dump("JSDOMParser error: " + m + "\n");
+ }
+ this.errorState += m + "\n";
+ },
+
+ /**
+ * Look at the next character without advancing the index.
+ */
+ peekNext: function () {
+ return this.html[this.currentChar];
+ },
+
+ /**
+ * Get the next character and advance the index.
+ */
+ nextChar: function () {
+ return this.html[this.currentChar++];
+ },
+
+ /**
+ * Called after a quote character is read. This finds the next quote
+ * character and returns the text string in between.
+ */
+ readString: function (quote) {
+ var str;
+ var n = this.html.indexOf(quote, this.currentChar);
+ if (n === -1) {
+ this.currentChar = this.html.length;
+ str = null;
+ } else {
+ str = this.html.substring(this.currentChar, n);
+ this.currentChar = n + 1;
+ }
+
+ return str;
+ },
+
+ /**
+ * Called when parsing a node. This finds the next name/value attribute
+ * pair and adds the result to the attributes list.
+ */
+ readAttribute: function (node) {
+ var name = "";
+
+ var n = this.html.indexOf("=", this.currentChar);
+ if (n === -1) {
+ this.currentChar = this.html.length;
+ } else {
+ // Read until a '=' character is hit; this will be the attribute key
+ name = this.html.substring(this.currentChar, n);
+ this.currentChar = n + 1;
+ }
+
+ if (!name)
+ return;
+
+ // After a '=', we should see a '"' for the attribute value
+ var c = this.nextChar();
+ if (c !== '"' && c !== "'") {
+ this.error("Error reading attribute " + name + ", expecting '\"'");
+ return;
+ }
+
+ // Read the attribute value (and consume the matching quote)
+ var value = this.readString(c);
+
+ node.attributes.push(new Attribute(name, decodeHTML(value)));
+
+ return;
+ },
+
+ /**
+ * Parses and returns an Element node. This is called after a '<' has been
+ * read.
+ *
+ * @returns an array; the first index of the array is the parsed node;
+ * the second index is a boolean indicating whether this is a void
+ * Element
+ */
+ makeElementNode: function (retPair) {
+ var c = this.nextChar();
+
+ // Read the Element tag name
+ var strBuf = this.strBuf;
+ strBuf.length = 0;
+ while (whitespace.indexOf(c) == -1 && c !== ">" && c !== "/") {
+ if (c === undefined)
+ return false;
+ strBuf.push(c);
+ c = this.nextChar();
+ }
+ var tag = strBuf.join("");
+
+ if (!tag)
+ return false;
+
+ var node = new Element(tag);
+
+ // Read Element attributes
+ while (c !== "/" && c !== ">") {
+ if (c === undefined)
+ return false;
+ while (whitespace.indexOf(this.html[this.currentChar++]) != -1) {
+ // Advance cursor to first non-whitespace char.
+ }
+ this.currentChar--;
+ c = this.nextChar();
+ if (c !== "/" && c !== ">") {
+ --this.currentChar;
+ this.readAttribute(node);
+ }
+ }
+
+ // If this is a self-closing tag, read '/>'
+ var closed = false;
+ if (c === "/") {
+ closed = true;
+ c = this.nextChar();
+ if (c !== ">") {
+ this.error("expected '>' to close " + tag);
+ return false;
+ }
+ }
+
+ retPair[0] = node;
+ retPair[1] = closed;
+ return true;
+ },
+
+ /**
+ * If the current input matches this string, advance the input index;
+ * otherwise, do nothing.
+ *
+ * @returns whether input matched string
+ */
+ match: function (str) {
+ var strlen = str.length;
+ if (this.html.substr(this.currentChar, strlen).toLowerCase() === str.toLowerCase()) {
+ this.currentChar += strlen;
+ return true;
+ }
+ return false;
+ },
+
+ /**
+ * Searches the input until a string is found and discards all input up to
+ * and including the matched string.
+ */
+ discardTo: function (str) {
+ var index = this.html.indexOf(str, this.currentChar) + str.length;
+ if (index === -1)
+ this.currentChar = this.html.length;
+ this.currentChar = index;
+ },
+
+ /**
+ * Reads child nodes for the given node.
+ */
+ readChildren: function (node) {
+ var child;
+ while ((child = this.readNode())) {
+ // Don't keep Comment nodes
+ if (child.nodeType !== 8) {
+ node.appendChild(child);
+ }
+ }
+ },
+
+ discardNextComment: function() {
+ if (this.match("--")) {
+ this.discardTo("-->");
+ } else {
+ var c = this.nextChar();
+ while (c !== ">") {
+ if (c === undefined)
+ return null;
+ if (c === '"' || c === "'")
+ this.readString(c);
+ c = this.nextChar();
+ }
+ }
+ return new Comment();
+ },
+
+
+ /**
+ * Reads the next child node from the input. If we're reading a closing
+ * tag, or if we've reached the end of input, return null.
+ *
+ * @returns the node
+ */
+ readNode: function () {
+ var c = this.nextChar();
+
+ if (c === undefined)
+ return null;
+
+ // Read any text as Text node
+ var textNode;
+ if (c !== "<") {
+ --this.currentChar;
+ textNode = new Text();
+ var n = this.html.indexOf("<", this.currentChar);
+ if (n === -1) {
+ textNode.innerHTML = this.html.substring(this.currentChar, this.html.length);
+ this.currentChar = this.html.length;
+ } else {
+ textNode.innerHTML = this.html.substring(this.currentChar, n);
+ this.currentChar = n;
+ }
+ return textNode;
+ }
+
+ if (this.match("![CDATA[")) {
+ var endChar = this.html.indexOf("]]>", this.currentChar);
+ if (endChar === -1) {
+ this.error("unclosed CDATA section");
+ return null;
+ }
+ textNode = new Text();
+ textNode.textContent = this.html.substring(this.currentChar, endChar);
+ this.currentChar = endChar + ("]]>").length;
+ return textNode;
+ }
+
+ c = this.peekNext();
+
+ // Read Comment node. Normally, Comment nodes know their inner
+ // textContent, but we don't really care about Comment nodes (we throw
+ // them away in readChildren()). So just returning an empty Comment node
+ // here is sufficient.
+ if (c === "!" || c === "?") {
+ // We're still before the ! or ? that is starting this comment:
+ this.currentChar++;
+ return this.discardNextComment();
+ }
+
+ // If we're reading a closing tag, return null. This means we've reached
+ // the end of this set of child nodes.
+ if (c === "/") {
+ --this.currentChar;
+ return null;
+ }
+
+ // Otherwise, we're looking at an Element node
+ var result = this.makeElementNode(this.retPair);
+ if (!result)
+ return null;
+
+ var node = this.retPair[0];
+ var closed = this.retPair[1];
+ var localName = node.localName;
+
+ // If this isn't a void Element, read its child nodes
+ if (!closed) {
+ this.readChildren(node);
+ var closingTag = "</" + node._matchingTag + ">";
+ if (!this.match(closingTag)) {
+ this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length));
+ return null;
+ }
+ }
+
+ // Only use the first title, because SVG might have other
+ // title elements which we don't care about (medium.com
+ // does this, at least).
+ if (localName === "title" && !this.doc.title) {
+ this.doc.title = node.textContent.trim();
+ } else if (localName === "head") {
+ this.doc.head = node;
+ } else if (localName === "body") {
+ this.doc.body = node;
+ } else if (localName === "html") {
+ this.doc.documentElement = node;
+ }
+
+ return node;
+ },
+
+ /**
+ * Parses an HTML string and returns a JS implementation of the Document.
+ */
+ parse: function (html, url) {
+ this.html = html;
+ var doc = this.doc = new Document(url);
+ this.readChildren(doc);
+
+ // If this is an HTML document, remove root-level children except for the
+ // <html> node
+ if (doc.documentElement) {
+ for (var i = doc.childNodes.length; --i >= 0;) {
+ var child = doc.childNodes[i];
+ if (child !== doc.documentElement) {
+ doc.removeChild(child);
+ }
+ }
+ }
+
+ return doc;
+ }
+ };
+
+ // Attach the standard DOM types to the global scope
+ global.Node = Node;
+ global.Comment = Comment;
+ global.Document = Document;
+ global.Element = Element;
+ global.Text = Text;
+
+ // Attach JSDOMParser to the global scope
+ global.JSDOMParser = JSDOMParser;
+
+})(this);
+
+if (typeof module === "object") {
+ /* global module */
+ module.exports = this.JSDOMParser;
+}