diff options
Diffstat (limited to 'comm/mailnews/extensions/newsblog/FeedParser.jsm')
-rw-r--r-- | comm/mailnews/extensions/newsblog/FeedParser.jsm | 1496 |
1 files changed, 1496 insertions, 0 deletions
diff --git a/comm/mailnews/extensions/newsblog/FeedParser.jsm b/comm/mailnews/extensions/newsblog/FeedParser.jsm new file mode 100644 index 0000000000..863d5789fe --- /dev/null +++ b/comm/mailnews/extensions/newsblog/FeedParser.jsm @@ -0,0 +1,1496 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +const EXPORTED_SYMBOLS = ["FeedParser"]; + +const lazy = {}; + +ChromeUtils.defineModuleGetter( + lazy, + "FeedItem", + "resource:///modules/FeedItem.jsm" +); +ChromeUtils.defineModuleGetter( + lazy, + "FeedEnclosure", + "resource:///modules/FeedItem.jsm" +); +ChromeUtils.defineModuleGetter( + lazy, + "FeedUtils", + "resource:///modules/FeedUtils.jsm" +); + +/** + * The feed parser. Depends on FeedItem.js, Feed.js. + * + * @class + */ +function FeedParser() { + this.parsedItems = []; + this.mSerializer = new XMLSerializer(); +} + +FeedParser.prototype = { + /** + * parseFeed() returns an array of parsed items ready for processing. It is + * currently a synchronous operation. If there is an error parsing the feed, + * parseFeed returns an empty feed in addition to calling aFeed.onParseError. + * + * @param {Feed} aFeed - The Feed object. + * @param {XMLDocument} aDOM - The document to parse. + * @returns {Array} - array of items, or empty array for error returns or + * nothing to do condition. + */ + parseFeed(aFeed, aDOM) { + if (!XMLDocument.isInstance(aDOM)) { + // No xml doc. + aFeed.onParseError(aFeed); + return []; + } + + let doc = aDOM.documentElement; + if (doc.namespaceURI == lazy.FeedUtils.MOZ_PARSERERROR_NS) { + // Gecko caught a basic parsing error. + let errStr = + doc.firstChild.textContent + "\n" + doc.firstElementChild.textContent; + lazy.FeedUtils.log.info("FeedParser.parseFeed: - " + errStr); + aFeed.onParseError(aFeed); + return []; + } else if (aDOM.querySelector("redirect")) { + // Check for RSS2.0 redirect document. + let channel = aDOM.querySelector("redirect"); + if (this.isPermanentRedirect(aFeed, channel, null)) { + return []; + } + + aFeed.onParseError(aFeed); + return []; + } else if ( + doc.namespaceURI == lazy.FeedUtils.RDF_SYNTAX_NS && + doc.getElementsByTagNameNS(lazy.FeedUtils.RSS_NS, "channel")[0] + ) { + aFeed.mFeedType = "RSS_1.xRDF"; + lazy.FeedUtils.log.debug( + "FeedParser.parseFeed: type:url - " + + aFeed.mFeedType + + " : " + + aFeed.url + ); + + return this.parseAsRSS1(aFeed, aDOM); + } else if (doc.namespaceURI == lazy.FeedUtils.ATOM_03_NS) { + aFeed.mFeedType = "ATOM_0.3"; + lazy.FeedUtils.log.debug( + "FeedParser.parseFeed: type:url - " + + aFeed.mFeedType + + " : " + + aFeed.url + ); + return this.parseAsAtom(aFeed, aDOM); + } else if (doc.namespaceURI == lazy.FeedUtils.ATOM_IETF_NS) { + aFeed.mFeedType = "ATOM_IETF"; + lazy.FeedUtils.log.debug( + "FeedParser.parseFeed: type:url - " + + aFeed.mFeedType + + " : " + + aFeed.url + ); + return this.parseAsAtomIETF(aFeed, aDOM); + } else if ( + doc.getElementsByTagNameNS(lazy.FeedUtils.RSS_090_NS, "channel")[0] + ) { + aFeed.mFeedType = "RSS_0.90"; + lazy.FeedUtils.log.debug( + "FeedParser.parseFeed: type:url - " + + aFeed.mFeedType + + " : " + + aFeed.url + ); + return this.parseAsRSS2(aFeed, aDOM); + } + + // Parse as RSS 0.9x. In theory even RSS 1.0 feeds could be parsed by + // the 0.9x parser if the RSS namespace were the default. + let rssVer = doc.localName == "rss" ? doc.getAttribute("version") : null; + if (rssVer) { + aFeed.mFeedType = "RSS_" + rssVer; + } else { + aFeed.mFeedType = "RSS_0.9x?"; + } + lazy.FeedUtils.log.debug( + "FeedParser.parseFeed: type:url - " + aFeed.mFeedType + " : " + aFeed.url + ); + return this.parseAsRSS2(aFeed, aDOM); + }, + + parseAsRSS2(aFeed, aDOM) { + // Get the first channel (assuming there is only one per RSS File). + let channel = aDOM.querySelector("channel"); + if (!channel) { + aFeed.onParseError(aFeed); + return []; + } + + // Usually the empty string, unless this is RSS .90. + let nsURI = channel.namespaceURI || ""; + + if (this.isPermanentRedirect(aFeed, null, channel)) { + return []; + } + + let tags = this.childrenByTagNameNS(channel, nsURI, "title"); + aFeed.title = aFeed.title || this.getNodeValue(tags ? tags[0] : null); + tags = this.childrenByTagNameNS(channel, nsURI, "description"); + aFeed.description = this.getNodeValueFormatted(tags ? tags[0] : null); + tags = this.childrenByTagNameNS(channel, nsURI, "link"); + aFeed.link = this.validLink(this.getNodeValue(tags ? tags[0] : null)); + + if (!(aFeed.title || aFeed.description)) { + lazy.FeedUtils.log.error( + "FeedParser.parseAsRSS2: missing mandatory element " + + "<title> and <description>" + ); + // The RSS2 spec requires a <link> as well, but we can do without it + // so ignore the case of (valid) link missing. + aFeed.onParseError(aFeed); + return []; + } + + if (!aFeed.parseItems) { + return []; + } + + this.findSyUpdateTags(aFeed, channel); + + aFeed.invalidateItems(); + // XXX use getElementsByTagNameNS for now; childrenByTagNameNS would be + // better, but RSS .90 is still with us. + let itemNodes = aDOM.getElementsByTagNameNS(nsURI, "item"); + itemNodes = itemNodes ? itemNodes : []; + lazy.FeedUtils.log.debug( + "FeedParser.parseAsRSS2: items to parse - " + itemNodes.length + ); + + for (let itemNode of itemNodes) { + if (!itemNode.childElementCount) { + continue; + } + + let item = new lazy.FeedItem(); + item.feed = aFeed; + item.enclosures = []; + item.keywords = []; + + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.FEEDBURNER_NS, + "origLink" + ); + let link = this.validLink(this.getNodeValue(tags ? tags[0] : null)); + if (!link) { + tags = this.childrenByTagNameNS(itemNode, nsURI, "link"); + link = this.validLink(this.getNodeValue(tags ? tags[0] : null)); + } + tags = this.childrenByTagNameNS(itemNode, nsURI, "guid"); + let guidNode = tags ? tags[0] : null; + + let guid; + let isPermaLink = false; + if (guidNode) { + guid = this.getNodeValue(guidNode); + // isPermaLink is true if the value is "true" or if the attribute is + // not present; all other values, including "false" and "False" and + // for that matter "TRuE" and "meatcake" are false. + if ( + !guidNode.hasAttribute("isPermaLink") || + guidNode.getAttribute("isPermaLink") == "true" + ) { + isPermaLink = true; + } + // If attribute isPermaLink is missing, it is good to check the validity + // of <guid> value as an URL to avoid linking to non-URL strings. + if (!guidNode.hasAttribute("isPermaLink")) { + try { + Services.io.newURI(guid); + if (Services.io.extractScheme(guid) == "tag") { + isPermaLink = false; + } + } catch (ex) { + isPermaLink = false; + } + } + + item.id = guid; + } + + let guidLink = this.validLink(guid); + if (isPermaLink && guidLink) { + item.url = guidLink; + } else if (link) { + item.url = link; + } else { + item.url = null; + } + + tags = this.childrenByTagNameNS(itemNode, nsURI, "description"); + item.description = this.getNodeValueFormatted(tags ? tags[0] : null); + tags = this.childrenByTagNameNS(itemNode, nsURI, "title"); + item.title = this.getNodeValue(tags ? tags[0] : null); + if (!(item.title || item.description)) { + lazy.FeedUtils.log.info( + "FeedParser.parseAsRSS2: <item> missing mandatory " + + "element, either <title> or <description>; skipping" + ); + continue; + } + + if (!item.id) { + // At this point, if there is no guid, uniqueness cannot be guaranteed + // by any of link or date (optional) or title (optional unless there + // is no description). Use a big chunk of description; minimize dupes + // with url and title if present. + item.id = + (item.url || item.feed.url) + + "#" + + item.title + + "#" + + (this.stripTags( + item.description ? item.description.substr(0, 150) : null + ) || item.title); + item.id = item.id.replace(/[\n\r\t\s]+/g, " "); + } + + // Escape html entities in <title>, which are unescaped as textContent + // values. If the title is used as content, it will remain escaped; if + // it is used as the title, it will be unescaped upon store. Bug 1240603. + // The <description> tag must follow escaping examples found in + // http://www.rssboard.org/rss-encoding-examples, i.e. single escape angle + // brackets for tags, which are removed if used as title, and double + // escape entities for presentation in title. + // Better: always use <title>. Best: use Atom. + if (!item.title) { + item.title = this.stripTags(item.description).substr(0, 150); + } else { + item.title = item.htmlEscape(item.title); + } + + tags = this.childrenByTagNameNS(itemNode, nsURI, "author"); + if (!tags) { + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.DC_NS, + "creator" + ); + } + let author = this.getNodeValue(tags ? tags[0] : null) || aFeed.title; + author = this.cleanAuthorName(author); + item.author = author ? ["<" + author + ">"] : item.author; + + tags = this.childrenByTagNameNS(itemNode, nsURI, "pubDate"); + if (!tags || !this.getNodeValue(tags[0])) { + tags = this.childrenByTagNameNS(itemNode, lazy.FeedUtils.DC_NS, "date"); + } + item.date = this.getNodeValue(tags ? tags[0] : null) || item.date; + + // If the date is invalid, users will see the beginning of the epoch + // unless we reset it here, so they'll see the current time instead. + // This is typical aggregator behavior. + if (item.date) { + item.date = item.date.trim(); + if (!lazy.FeedUtils.isValidRFC822Date(item.date)) { + // XXX Use this on the other formats as well. + item.date = this.dateRescue(item.date); + } + } + + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.RSS_CONTENT_NS, + "encoded" + ); + item.content = this.getNodeValueFormatted(tags ? tags[0] : null); + + // Handle <enclosures> and <media:content>, which may be in a + // <media:group> (if present). + tags = this.childrenByTagNameNS(itemNode, nsURI, "enclosure"); + let encUrls = []; + if (tags) { + for (let tag of tags) { + let url = this.validLink(tag.getAttribute("url")); + if (url && !encUrls.includes(url)) { + let type = this.removeUnprintableASCII(tag.getAttribute("type")); + let length = this.removeUnprintableASCII( + tag.getAttribute("length") + ); + item.enclosures.push(new lazy.FeedEnclosure(url, type, length)); + encUrls.push(url); + } + } + } + + tags = itemNode.getElementsByTagNameNS(lazy.FeedUtils.MRSS_NS, "content"); + if (tags) { + for (let tag of tags) { + let url = this.validLink(tag.getAttribute("url")); + if (url && !encUrls.includes(url)) { + let type = this.removeUnprintableASCII(tag.getAttribute("type")); + let fileSize = this.removeUnprintableASCII( + tag.getAttribute("fileSize") + ); + item.enclosures.push(new lazy.FeedEnclosure(url, type, fileSize)); + } + } + } + + // The <origEnclosureLink> tag has no specification, especially regarding + // whether more than one tag is allowed and, if so, how tags would + // relate to previously declared (and well specified) enclosure urls. + // The common usage is to include 1 origEnclosureLink, in addition to + // the specified enclosure tags for 1 enclosure. Thus, we will replace the + // first enclosure's, if found, url with the first <origEnclosureLink> + // url only or else add the <origEnclosureLink> url. + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.FEEDBURNER_NS, + "origEnclosureLink" + ); + let origEncUrl = this.validLink(this.getNodeValue(tags ? tags[0] : null)); + if (origEncUrl) { + if (item.enclosures.length) { + item.enclosures[0].mURL = origEncUrl; + } else { + item.enclosures.push(new lazy.FeedEnclosure(origEncUrl)); + } + } + + // Support <category> and autotagging. + tags = this.childrenByTagNameNS(itemNode, nsURI, "category"); + if (tags) { + for (let tag of tags) { + let term = this.getNodeValue(tag); + term = term ? this.xmlUnescape(term.replace(/,/g, ";")) : null; + if (term && !item.keywords.includes(term)) { + item.keywords.push(term); + } + } + } + + this.parsedItems.push(item); + } + + return this.parsedItems; + }, + + /** + * Extracts feed details and (optionally) items from an RSS1 + * feed which has already been XML-parsed as an XMLDocument. + * The feed items are extracted only if feed.parseItems is set. + * + * Technically RSS1 is supposed to be treated as RDFXML, but in practice + * no feed parser anywhere ever does this, and feeds in the wild are + * pretty shakey on their RDF encoding too. So we just treat it as raw + * XML and pick out the bits we want. + * + * @param {Feed} feed - The Feed object. + * @param {XMLDocument} doc - The document to parse. + * @returns {Array} - array of FeedItems or empty array for error returns or + * nothing to do condition (ie unset feed.parseItems). + */ + parseAsRSS1(feed, doc) { + let channel = doc.querySelector("channel"); + if (!channel) { + feed.onParseError(feed); + return []; + } + + if (this.isPermanentRedirect(feed, null, channel)) { + return []; + } + + let titleNode = this.childByTagNameNS( + channel, + lazy.FeedUtils.RSS_NS, + "title" + ); + // If user entered a title manually, retain it. + feed.title = feed.title || this.getNodeValue(titleNode) || feed.url; + + let descNode = this.childByTagNameNS( + channel, + lazy.FeedUtils.RSS_NS, + "description" + ); + feed.description = this.getNodeValueFormatted(descNode) || ""; + + let linkNode = this.childByTagNameNS( + channel, + lazy.FeedUtils.RSS_NS, + "link" + ); + feed.link = this.validLink(this.getNodeValue(linkNode)) || feed.url; + + if (!(feed.title || feed.description) || !feed.link) { + lazy.FeedUtils.log.error( + "FeedParser.parseAsRSS1: missing mandatory element " + + "<title> and <description>, or <link>" + ); + feed.onParseError(feed); + return []; + } + + // If we're only interested in the overall feed description, we're done. + if (!feed.parseItems) { + return []; + } + + this.findSyUpdateTags(feed, channel); + + feed.invalidateItems(); + + // Now process all the individual items in the feed. + let itemNodes = doc.getElementsByTagNameNS(lazy.FeedUtils.RSS_NS, "item"); + itemNodes = itemNodes ? itemNodes : []; + + for (let itemNode of itemNodes) { + let item = new lazy.FeedItem(); + item.feed = feed; + + // Prefer the value of the link tag to the item URI since the URI could be + // a relative URN. + let itemURI = itemNode.getAttribute("about") || ""; + itemURI = this.removeUnprintableASCII(itemURI.trim()); + let linkNode = this.childByTagNameNS( + itemNode, + lazy.FeedUtils.RSS_NS, + "link" + ); + item.id = this.getNodeValue(linkNode) || itemURI; + item.url = this.validLink(item.id); + + let descNode = this.childByTagNameNS( + itemNode, + lazy.FeedUtils.RSS_NS, + "description" + ); + item.description = this.getNodeValueFormatted(descNode); + + let titleNode = this.childByTagNameNS( + itemNode, + lazy.FeedUtils.RSS_NS, + "title" + ); + let subjectNode = this.childByTagNameNS( + itemNode, + lazy.FeedUtils.DC_NS, + "subject" + ); + + item.title = + this.getNodeValue(titleNode) || this.getNodeValue(subjectNode); + if (!item.title && item.description) { + item.title = this.stripTags(item.description).substr(0, 150); + } + if (!item.url || !item.title) { + lazy.FeedUtils.log.info( + "FeedParser.parseAsRSS1: <item> missing mandatory " + + "element <item rdf:about> and <link>, or <title> and " + + "no <description>; skipping" + ); + continue; + } + + // TODO XXX: ignores multiple authors. + let authorNode = this.childByTagNameNS( + itemNode, + lazy.FeedUtils.DC_NS, + "creator" + ); + let channelCreatorNode = this.childByTagNameNS( + channel, + lazy.FeedUtils.DC_NS, + "creator" + ); + let author = + this.getNodeValue(authorNode) || + this.getNodeValue(channelCreatorNode) || + feed.title; + author = this.cleanAuthorName(author); + item.author = author ? ["<" + author + ">"] : item.author; + + let dateNode = this.childByTagNameNS( + itemNode, + lazy.FeedUtils.DC_NS, + "date" + ); + item.date = this.getNodeValue(dateNode) || item.date; + + let contentNode = this.childByTagNameNS( + itemNode, + lazy.FeedUtils.RSS_CONTENT_NS, + "encoded" + ); + item.content = this.getNodeValueFormatted(contentNode); + + this.parsedItems.push(item); + } + lazy.FeedUtils.log.debug( + "FeedParser.parseAsRSS1: items parsed - " + this.parsedItems.length + ); + + return this.parsedItems; + }, + + // TODO: deprecate ATOM_03_NS. + parseAsAtom(aFeed, aDOM) { + // Get the first channel (assuming there is only one per Atom File). + let channel = aDOM.querySelector("feed"); + if (!channel) { + aFeed.onParseError(aFeed); + return []; + } + + if (this.isPermanentRedirect(aFeed, null, channel)) { + return []; + } + + let tags = this.childrenByTagNameNS( + channel, + lazy.FeedUtils.ATOM_03_NS, + "title" + ); + aFeed.title = + aFeed.title || this.stripTags(this.getNodeValue(tags ? tags[0] : null)); + tags = this.childrenByTagNameNS( + channel, + lazy.FeedUtils.ATOM_03_NS, + "tagline" + ); + aFeed.description = this.getNodeValueFormatted(tags ? tags[0] : null); + tags = this.childrenByTagNameNS(channel, lazy.FeedUtils.ATOM_03_NS, "link"); + aFeed.link = this.validLink(this.findAtomLink("alternate", tags)); + + if (!aFeed.title) { + lazy.FeedUtils.log.error( + "FeedParser.parseAsAtom: missing mandatory element <title>" + ); + aFeed.onParseError(aFeed); + return []; + } + + if (!aFeed.parseItems) { + return []; + } + + this.findSyUpdateTags(aFeed, channel); + + aFeed.invalidateItems(); + let items = this.childrenByTagNameNS( + channel, + lazy.FeedUtils.ATOM_03_NS, + "entry" + ); + items = items ? items : []; + lazy.FeedUtils.log.debug( + "FeedParser.parseAsAtom: items to parse - " + items.length + ); + + for (let itemNode of items) { + if (!itemNode.childElementCount) { + continue; + } + + let item = new lazy.FeedItem(); + item.feed = aFeed; + + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_03_NS, + "link" + ); + item.url = this.validLink(this.findAtomLink("alternate", tags)); + + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_03_NS, + "id" + ); + item.id = this.getNodeValue(tags ? tags[0] : null); + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_03_NS, + "summary" + ); + item.description = this.getNodeValueFormatted(tags ? tags[0] : null); + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_03_NS, + "title" + ); + item.title = + this.getNodeValue(tags ? tags[0] : null) || + (item.description ? item.description.substr(0, 150) : null); + if (!item.title || !item.id) { + // We're lenient about other mandatory tags, but insist on these. + lazy.FeedUtils.log.info( + "FeedParser.parseAsAtom: <entry> missing mandatory " + + "element <id>, or <title> and no <summary>; skipping" + ); + continue; + } + + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_03_NS, + "author" + ); + if (!tags) { + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_03_NS, + "contributor" + ); + } + if (!tags) { + tags = this.childrenByTagNameNS( + channel, + lazy.FeedUtils.ATOM_03_NS, + "author" + ); + } + + let authorEl = tags ? tags[0] : null; + + let author = ""; + if (authorEl) { + tags = this.childrenByTagNameNS( + authorEl, + lazy.FeedUtils.ATOM_03_NS, + "name" + ); + let name = this.getNodeValue(tags ? tags[0] : null); + tags = this.childrenByTagNameNS( + authorEl, + lazy.FeedUtils.ATOM_03_NS, + "email" + ); + let email = this.getNodeValue(tags ? tags[0] : null); + if (name) { + author = name + (email ? " <" + email + ">" : ""); + } else if (email) { + author = email; + } + } + + item.author = author || item.author || aFeed.title; + + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_03_NS, + "modified" + ); + if (!tags || !this.getNodeValue(tags[0])) { + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_03_NS, + "issued" + ); + } + if (!tags || !this.getNodeValue(tags[0])) { + tags = this.childrenByTagNameNS( + channel, + lazy.FeedUtils.ATOM_03_NS, + "created" + ); + } + + item.date = this.getNodeValue(tags ? tags[0] : null) || item.date; + + // XXX We should get the xml:base attribute from the content tag as well + // and use it as the base HREF of the message. + // XXX Atom feeds can have multiple content elements; we should differentiate + // between them and pick the best one. + // Some Atom feeds wrap the content in a CTYPE declaration; others use + // a namespace to identify the tags as HTML; and a few are buggy and put + // HTML tags in without declaring their namespace so they look like Atom. + // We deal with the first two but not the third. + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_03_NS, + "content" + ); + let contentNode = tags ? tags[0] : null; + + let content; + if (contentNode) { + content = ""; + for (let node of contentNode.childNodes) { + if (node.nodeType == node.CDATA_SECTION_NODE) { + content += node.data; + } else { + content += this.mSerializer.serializeToString(node); + } + } + + if (contentNode.getAttribute("mode") == "escaped") { + content = content.replace(/</g, "<"); + content = content.replace(/>/g, ">"); + content = content.replace(/&/g, "&"); + } + + if (content == "") { + content = null; + } + } + + item.content = content; + this.parsedItems.push(item); + } + + return this.parsedItems; + }, + + parseAsAtomIETF(aFeed, aDOM) { + // Get the first channel (assuming there is only one per Atom File). + let channel = this.childrenByTagNameNS( + aDOM, + lazy.FeedUtils.ATOM_IETF_NS, + "feed" + )[0]; + if (!channel) { + aFeed.onParseError(aFeed); + return []; + } + + if (this.isPermanentRedirect(aFeed, null, channel)) { + return []; + } + + let contentBase = channel.getAttribute("xml:base"); + + let tags = this.childrenByTagNameNS( + channel, + lazy.FeedUtils.ATOM_IETF_NS, + "title" + ); + aFeed.title = + aFeed.title || + this.stripTags(this.serializeTextConstruct(tags ? tags[0] : null)); + + tags = this.childrenByTagNameNS( + channel, + lazy.FeedUtils.ATOM_IETF_NS, + "subtitle" + ); + aFeed.description = this.serializeTextConstruct(tags ? tags[0] : null); + + // Per spec, aFeed.link and contentBase may both end up null here. + tags = this.childrenByTagNameNS( + channel, + lazy.FeedUtils.ATOM_IETF_NS, + "link" + ); + aFeed.link = + this.findAtomLink("self", tags, contentBase) || + this.findAtomLink("alternate", tags, contentBase); + aFeed.link = this.validLink(aFeed.link); + if (!contentBase) { + contentBase = aFeed.link; + } + + if (!aFeed.title) { + lazy.FeedUtils.log.error( + "FeedParser.parseAsAtomIETF: missing mandatory element <title>" + ); + aFeed.onParseError(aFeed); + return []; + } + + if (!aFeed.parseItems) { + return []; + } + + this.findSyUpdateTags(aFeed, channel); + + aFeed.invalidateItems(); + let items = this.childrenByTagNameNS( + channel, + lazy.FeedUtils.ATOM_IETF_NS, + "entry" + ); + items = items ? items : []; + lazy.FeedUtils.log.debug( + "FeedParser.parseAsAtomIETF: items to parse - " + items.length + ); + + for (let itemNode of items) { + if (!itemNode.childElementCount) { + continue; + } + + let item = new lazy.FeedItem(); + item.feed = aFeed; + item.enclosures = []; + item.keywords = []; + + contentBase = itemNode.getAttribute("xml:base") || contentBase; + + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_IETF_NS, + "source" + ); + let source = tags ? tags[0] : null; + + // Per spec, item.link and contentBase may both end up null here. + // If <content> is also not present, then <link rel="alternate"> is MUST + // but we're lenient. + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.FEEDBURNER_NS, + "origLink" + ); + item.url = this.validLink(this.getNodeValue(tags ? tags[0] : null)); + if (!item.url) { + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_IETF_NS, + "link" + ); + item.url = + this.validLink(this.findAtomLink("alternate", tags, contentBase)) || + aFeed.link; + } + if (!contentBase) { + contentBase = item.url; + } + + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_IETF_NS, + "id" + ); + item.id = this.getNodeValue(tags ? tags[0] : null); + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_IETF_NS, + "summary" + ); + item.description = this.serializeTextConstruct(tags ? tags[0] : null); + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_IETF_NS, + "title" + ); + if (!tags || !this.getNodeValue(tags[0])) { + tags = this.childrenByTagNameNS( + source, + lazy.FeedUtils.ATOM_IETF_NS, + "title" + ); + } + item.title = this.stripTags( + this.serializeTextConstruct(tags ? tags[0] : null) || + (item.description ? item.description.substr(0, 150) : null) + ); + if (!item.title || !item.id) { + // We're lenient about other mandatory tags, but insist on these. + lazy.FeedUtils.log.info( + "FeedParser.parseAsAtomIETF: <entry> missing mandatory " + + "element <id>, or <title> and no <summary>; skipping" + ); + continue; + } + + // Support multiple authors. + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_IETF_NS, + "author" + ); + if (!tags) { + tags = this.childrenByTagNameNS( + source, + lazy.FeedUtils.ATOM_IETF_NS, + "author" + ); + } + if (!tags) { + tags = this.childrenByTagNameNS( + channel, + lazy.FeedUtils.ATOM_IETF_NS, + "author" + ); + } + + let authorTags = tags || []; + let authors = []; + for (let authorTag of authorTags) { + let author = ""; + tags = this.childrenByTagNameNS( + authorTag, + lazy.FeedUtils.ATOM_IETF_NS, + "name" + ); + let name = this.getNodeValue(tags ? tags[0] : null); + tags = this.childrenByTagNameNS( + authorTag, + lazy.FeedUtils.ATOM_IETF_NS, + "email" + ); + let email = this.getNodeValue(tags ? tags[0] : null); + if (name) { + name = this.cleanAuthorName(name); + if (email) { + if (!email.match(/^<.*>$/)) { + email = " <" + email + ">"; + } + author = name + email; + } else { + author = "<" + name + ">"; + } + } else if (email) { + author = email; + } + + if (author) { + authors.push(author); + } + } + + if (authors.length == 0) { + tags = this.childrenByTagNameNS( + channel, + lazy.FeedUtils.DC_NS, + "publisher" + ); + let author = this.getNodeValue(tags ? tags[0] : null) || aFeed.title; + author = this.cleanAuthorName(author); + item.author = author ? ["<" + author + ">"] : item.author; + } else { + item.author = authors; + } + lazy.FeedUtils.log.trace( + "FeedParser.parseAsAtomIETF: author(s) - " + item.author + ); + + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_IETF_NS, + "updated" + ); + if (!tags || !this.getNodeValue(tags[0])) { + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_IETF_NS, + "published" + ); + } + if (!tags || !this.getNodeValue(tags[0])) { + tags = this.childrenByTagNameNS( + source, + lazy.FeedUtils.ATOM_IETF_NS, + "published" + ); + } + item.date = this.getNodeValue(tags ? tags[0] : null) || item.date; + + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_IETF_NS, + "content" + ); + item.content = this.serializeTextConstruct(tags ? tags[0] : null); + + // Ensure relative links can be resolved and Content-Base set to an + // absolute url for the entry. But it's not mandatory that a url is found + // for Content-Base, per spec. + if (item.content) { + item.xmlContentBase = + (tags && tags[0].getAttribute("xml:base")) || contentBase; + } else if (item.description) { + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_IETF_NS, + "summary" + ); + item.xmlContentBase = + (tags && tags[0].getAttribute("xml:base")) || contentBase; + } else { + item.xmlContentBase = contentBase; + } + + item.xmlContentBase = this.validLink(item.xmlContentBase); + + // Handle <link rel="enclosure"> (if present). + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_IETF_NS, + "link" + ); + let encUrls = []; + if (tags) { + for (let tag of tags) { + let url = + tag.getAttribute("rel") == "enclosure" + ? (tag.getAttribute("href") || "").trim() + : null; + url = this.validLink(url); + if (url && !encUrls.includes(url)) { + let type = this.removeUnprintableASCII(tag.getAttribute("type")); + let length = this.removeUnprintableASCII( + tag.getAttribute("length") + ); + let title = this.removeUnprintableASCII(tag.getAttribute("title")); + item.enclosures.push( + new lazy.FeedEnclosure(url, type, length, title) + ); + encUrls.push(url); + } + } + } + + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.FEEDBURNER_NS, + "origEnclosureLink" + ); + let origEncUrl = this.validLink(this.getNodeValue(tags ? tags[0] : null)); + if (origEncUrl) { + if (item.enclosures.length) { + item.enclosures[0].mURL = origEncUrl; + } else { + item.enclosures.push(new lazy.FeedEnclosure(origEncUrl)); + } + } + + // Handle atom threading extension, RFC4685. There may be 1 or more tags, + // and each must contain a ref attribute with 1 Message-Id equivalent + // value. This is the only attr of interest in the spec for presentation. + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_THREAD_NS, + "in-reply-to" + ); + if (tags) { + for (let tag of tags) { + let ref = this.removeUnprintableASCII(tag.getAttribute("ref")); + if (ref) { + item.inReplyTo += item.normalizeMessageID(ref) + " "; + } + } + item.inReplyTo = item.inReplyTo.trimRight(); + } + + // Support <category> and autotagging. + tags = this.childrenByTagNameNS( + itemNode, + lazy.FeedUtils.ATOM_IETF_NS, + "category" + ); + if (tags) { + for (let tag of tags) { + let term = this.removeUnprintableASCII(tag.getAttribute("term")); + term = term ? this.xmlUnescape(term.replace(/,/g, ";")).trim() : null; + if (term && !item.keywords.includes(term)) { + item.keywords.push(term); + } + } + } + + this.parsedItems.push(item); + } + + return this.parsedItems; + }, + + isPermanentRedirect(aFeed, aRedirDocChannel, aFeedChannel) { + // If subscribing to a new feed, do not check redirect tags. + if (!aFeed.downloadCallback || aFeed.downloadCallback.mSubscribeMode) { + return false; + } + + let tags, tagName, newUrl; + let oldUrl = aFeed.url; + + // Check for RSS2.0 redirect document <newLocation> tag. + if (aRedirDocChannel) { + tagName = "newLocation"; + tags = this.childrenByTagNameNS(aRedirDocChannel, "", tagName); + newUrl = this.getNodeValue(tags ? tags[0] : null); + } + + // Check for <itunes:new-feed-url> tag. + if (aFeedChannel) { + tagName = "new-feed-url"; + tags = this.childrenByTagNameNS( + aFeedChannel, + lazy.FeedUtils.ITUNES_NS, + tagName + ); + newUrl = this.getNodeValue(tags ? tags[0] : null); + tagName = "itunes:" + tagName; + } + + if ( + newUrl && + newUrl != oldUrl && + lazy.FeedUtils.isValidScheme(newUrl) && + lazy.FeedUtils.changeUrlForFeed(aFeed, newUrl) + ) { + lazy.FeedUtils.log.info( + "FeedParser.isPermanentRedirect: found <" + + tagName + + "> tag; updated feed url from: " + + oldUrl + + " to: " + + newUrl + + " in folder: " + + lazy.FeedUtils.getFolderPrettyPath(aFeed.folder) + ); + aFeed.onUrlChange(aFeed, oldUrl); + return true; + } + + return false; + }, + + serializeTextConstruct(textElement) { + let content = ""; + if (textElement) { + let textType = textElement.getAttribute("type"); + + // Atom spec says consider it "text" if not present. + if (!textType) { + textType = "text"; + } + + // There could be some strange content type we don't handle. + if (textType != "text" && textType != "html" && textType != "xhtml") { + return null; + } + + for (let node of textElement.childNodes) { + if (node.nodeType == node.CDATA_SECTION_NODE) { + content += this.xmlEscape(node.data); + } else { + content += this.mSerializer.serializeToString(node); + } + } + + if (textType == "html") { + content = this.xmlUnescape(content); + } + + content = content.trim(); + } + + // Other parts of the code depend on this being null if there's no content. + return content ? content : null; + }, + + /** + * Return a cleaned up author name value. + * + * @param {string} authorString - A string. + * @returns {String} - A clean string value. + */ + cleanAuthorName(authorString) { + if (!authorString) { + return ""; + } + lazy.FeedUtils.log.trace( + "FeedParser.cleanAuthor: author1 - " + authorString + ); + let author = authorString + .replace(/[\n\r\t]+/g, " ") + .replace(/"/g, '\\"') + .trim(); + // If the name contains special chars, quote it. + if (author.match(/[<>@,"]/)) { + author = '"' + author + '"'; + } + lazy.FeedUtils.log.trace("FeedParser.cleanAuthor: author2 - " + author); + + return author; + }, + + /** + * Return a cleaned up node value. This is intended for values that are not + * multiline and not formatted. A sequence of tab or newline is converted to + * a space and unprintable ascii is removed. + * + * @param {Node} node - A DOM node. + * @returns {String} - A clean string value or null. + */ + getNodeValue(node) { + let nodeValue = this.getNodeValueRaw(node); + if (!nodeValue) { + return null; + } + + nodeValue = nodeValue.replace(/[\n\r\t]+/g, " "); + return this.removeUnprintableASCII(nodeValue); + }, + + /** + * Return a cleaned up formatted node value, meaning CR/LF/TAB are retained + * while all other unprintable ascii is removed. This is intended for values + * that are multiline and formatted, such as content or description tags. + * + * @param {Node} node - A DOM node. + * @returns {String} - A clean string value or null. + */ + getNodeValueFormatted(node) { + let nodeValue = this.getNodeValueRaw(node); + if (!nodeValue) { + return null; + } + + return this.removeUnprintableASCIIexCRLFTAB(nodeValue); + }, + + /** + * Return a raw node value, as received. This should be sanitized as + * appropriate. + * + * @param {Node} node - A DOM node. + * @returns {String} - A string value or null. + */ + getNodeValueRaw(node) { + if (node && node.textContent) { + return node.textContent.trim(); + } + + if (node && node.firstChild) { + let ret = ""; + for (let child = node.firstChild; child; child = child.nextSibling) { + let value = this.getNodeValueRaw(child); + if (value) { + ret += value; + } + } + + if (ret) { + return ret.trim(); + } + } + + return null; + }, + + // Finds elements that are direct children of the first arg. + childrenByTagNameNS(aElement, aNamespace, aTagName) { + if (!aElement) { + return null; + } + + let matches = aElement.getElementsByTagNameNS(aNamespace, aTagName); + let matchingChildren = []; + for (let match of matches) { + if (match.parentNode == aElement) { + matchingChildren.push(match); + } + } + + return matchingChildren.length ? matchingChildren : null; + }, + + /** + * Returns first matching descendent of element, or null. + * + * @param {Element} element - DOM element to search. + * @param {string} namespace - Namespace of the search tag. + * @param {String} tagName - Tag to search for. + * @returns {Element|null} - Matching element, or null. + */ + childByTagNameNS(element, namespace, tagName) { + if (!element) { + return null; + } + // Handily, item() returns null for out-of-bounds access. + return element.getElementsByTagNameNS(namespace, tagName).item(0); + }, + + /** + * Ensure <link> type tags start with http[s]://, ftp:// or magnet: + * for values stored in mail headers (content-base and remote enclosures), + * particularly to prevent data: uris, javascript, and other spoofing. + * + * @param {string} link - An intended http url string. + * @returns {String} - A clean string starting with http, ftp or magnet, + * else null. + */ + validLink(link) { + if (/^((https?|ftp):\/\/|magnet:)/.test(link)) { + return this.removeUnprintableASCII(link.trim()); + } + + return null; + }, + + /** + * Return an absolute link for <entry> relative links. If xml:base is + * present in a <feed> attribute or child <link> element attribute, use it; + * otherwise the Feed.link will be the relevant <feed> child <link> value + * and will be the |baseURI| for <entry> child <link>s if there is no further + * xml:base, which may be an attribute of any element. + * + * @param {string} linkRel - the <link> rel attribute value to find. + * @param {NodeList} linkElements - the nodelist of <links> to search in. + * @param {string} baseURI - the url to use when resolving relative + * links to absolute values. + * @returns {String} or null - absolute url for a <link>, or null if the + * rel type is not found. + */ + findAtomLink(linkRel, linkElements, baseURI) { + if (!linkElements) { + return null; + } + + // XXX Need to check for MIME type and hreflang. + for (let alink of linkElements) { + if ( + alink && + // If there's a link rel. + ((alink.getAttribute("rel") && alink.getAttribute("rel") == linkRel) || + // If there isn't, assume 'alternate'. + (!alink.getAttribute("rel") && linkRel == "alternate")) && + alink.getAttribute("href") + ) { + // Atom links are interpreted relative to xml:base. + let href = alink.getAttribute("href"); + baseURI = alink.getAttribute("xml:base") || baseURI || href; + try { + return Services.io.newURI(baseURI).resolve(href); + } catch (ex) {} + } + } + + return null; + }, + + /** + * Find RSS Syndication extension tags. + * http://web.resource.org/rss/1.0/modules/syndication/ + * + * @param {Feed} aFeed - the feed object. + * @param {Node | String} aChannel - dom node for the <channel>. + * @returns {void} + */ + findSyUpdateTags(aFeed, aChannel) { + let tag, updatePeriod, updateFrequency, updateBase; + tag = this.childrenByTagNameNS( + aChannel, + lazy.FeedUtils.RSS_SY_NS, + "updatePeriod" + ); + updatePeriod = this.getNodeValue(tag ? tag[0] : null) || ""; + tag = this.childrenByTagNameNS( + aChannel, + lazy.FeedUtils.RSS_SY_NS, + "updateFrequency" + ); + updateFrequency = this.getNodeValue(tag ? tag[0] : null) || ""; + tag = this.childrenByTagNameNS( + aChannel, + lazy.FeedUtils.RSS_SY_NS, + "updateBase" + ); + updateBase = this.getNodeValue(tag ? tag[0] : null) || ""; + lazy.FeedUtils.log.debug( + "FeedParser.findSyUpdateTags: updatePeriod:updateFrequency - " + + updatePeriod + + ":" + + updateFrequency + ); + + if (updatePeriod) { + if (lazy.FeedUtils.RSS_SY_UNITS.includes(updatePeriod.toLowerCase())) { + updatePeriod = updatePeriod.toLowerCase(); + } else { + updatePeriod = "daily"; + } + } + + updateFrequency = isNaN(updateFrequency) ? 1 : updateFrequency; + + let options = aFeed.options; + if ( + options.updates.updatePeriod == updatePeriod && + options.updates.updateFrequency == updateFrequency && + options.updates.updateBase == updateBase + ) { + return; + } + + options.updates.updatePeriod = updatePeriod; + options.updates.updateFrequency = updateFrequency; + options.updates.updateBase = updateBase; + aFeed.options = options; + }, + + /** + * Remove unprintable ascii, particularly CR/LF, for non formatted tag values. + * + * @param {string} s - String to clean. + * @returns {String} - Cleaned string. + */ + removeUnprintableASCII(s) { + /* eslint-disable-next-line no-control-regex */ + return s ? s.replace(/[\x00-\x1F\x7F]+/g, "") : ""; + }, + + /** + * Remove unprintable ascii, except CR/LF/TAB, for formatted tag values. + * + * @param {string} s - String to clean. + * @returns {String} - Cleaned string. + */ + removeUnprintableASCIIexCRLFTAB(s) { + /* eslint-disable-next-line no-control-regex */ + return s ? s.replace(/[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]+/g, "") : ""; + }, + + stripTags(someHTML) { + return someHTML ? someHTML.replace(/<[^>]+>/g, "") : someHTML; + }, + + xmlUnescape(s) { + s = s.replace(/</g, "<"); + s = s.replace(/>/g, ">"); + s = s.replace(/&/g, "&"); + return s; + }, + + xmlEscape(s) { + s = s.replace(/&/g, "&"); + s = s.replace(/>/g, ">"); + s = s.replace(/</g, "<"); + return s; + }, + + dateRescue(dateString) { + // Deal with various kinds of invalid dates. + if (!isNaN(parseInt(dateString))) { + // It's an integer, so maybe it's a timestamp. + let d = new Date(parseInt(dateString) * 1000); + let now = new Date(); + let yeardiff = now.getFullYear() - d.getFullYear(); + lazy.FeedUtils.log.trace( + "FeedParser.dateRescue: Rescue Timestamp date - " + + d.toString() + + " ,year diff - " + + yeardiff + ); + if (yeardiff >= 0 && yeardiff < 3) { + // It's quite likely the correct date. + return d.toString(); + } + } + + // Could be an ISO8601/W3C date. If not, get the current time. + return lazy.FeedUtils.getValidRFC5322Date(dateString); + }, +}; |