/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ const EXPORTED_SYMBOLS = ["FeedParser"]; const lazy = {}; ChromeUtils.defineModuleGetter( lazy, "FeedItem", "resource:///modules/FeedItem.jsm" ); ChromeUtils.defineModuleGetter( lazy, "FeedEnclosure", "resource:///modules/FeedItem.jsm" ); ChromeUtils.defineModuleGetter( lazy, "FeedUtils", "resource:///modules/FeedUtils.jsm" ); /** * The feed parser. Depends on FeedItem.js, Feed.js. * * @class */ function FeedParser() { this.parsedItems = []; this.mSerializer = new XMLSerializer(); } FeedParser.prototype = { /** * parseFeed() returns an array of parsed items ready for processing. It is * currently a synchronous operation. If there is an error parsing the feed, * parseFeed returns an empty feed in addition to calling aFeed.onParseError. * * @param {Feed} aFeed - The Feed object. * @param {XMLDocument} aDOM - The document to parse. * @returns {Array} - array of items, or empty array for error returns or * nothing to do condition. */ parseFeed(aFeed, aDOM) { if (!XMLDocument.isInstance(aDOM)) { // No xml doc. aFeed.onParseError(aFeed); return []; } let doc = aDOM.documentElement; if (doc.namespaceURI == lazy.FeedUtils.MOZ_PARSERERROR_NS) { // Gecko caught a basic parsing error. let errStr = doc.firstChild.textContent + "\n" + doc.firstElementChild.textContent; lazy.FeedUtils.log.info("FeedParser.parseFeed: - " + errStr); aFeed.onParseError(aFeed); return []; } else if (aDOM.querySelector("redirect")) { // Check for RSS2.0 redirect document. let channel = aDOM.querySelector("redirect"); if (this.isPermanentRedirect(aFeed, channel, null)) { return []; } aFeed.onParseError(aFeed); return []; } else if ( doc.namespaceURI == lazy.FeedUtils.RDF_SYNTAX_NS && doc.getElementsByTagNameNS(lazy.FeedUtils.RSS_NS, "channel")[0] ) { aFeed.mFeedType = "RSS_1.xRDF"; lazy.FeedUtils.log.debug( "FeedParser.parseFeed: type:url - " + aFeed.mFeedType + " : " + aFeed.url ); return this.parseAsRSS1(aFeed, aDOM); } else if (doc.namespaceURI == lazy.FeedUtils.ATOM_03_NS) { aFeed.mFeedType = "ATOM_0.3"; lazy.FeedUtils.log.debug( "FeedParser.parseFeed: type:url - " + aFeed.mFeedType + " : " + aFeed.url ); return this.parseAsAtom(aFeed, aDOM); } else if (doc.namespaceURI == lazy.FeedUtils.ATOM_IETF_NS) { aFeed.mFeedType = "ATOM_IETF"; lazy.FeedUtils.log.debug( "FeedParser.parseFeed: type:url - " + aFeed.mFeedType + " : " + aFeed.url ); return this.parseAsAtomIETF(aFeed, aDOM); } else if ( doc.getElementsByTagNameNS(lazy.FeedUtils.RSS_090_NS, "channel")[0] ) { aFeed.mFeedType = "RSS_0.90"; lazy.FeedUtils.log.debug( "FeedParser.parseFeed: type:url - " + aFeed.mFeedType + " : " + aFeed.url ); return this.parseAsRSS2(aFeed, aDOM); } // Parse as RSS 0.9x. In theory even RSS 1.0 feeds could be parsed by // the 0.9x parser if the RSS namespace were the default. let rssVer = doc.localName == "rss" ? doc.getAttribute("version") : null; if (rssVer) { aFeed.mFeedType = "RSS_" + rssVer; } else { aFeed.mFeedType = "RSS_0.9x?"; } lazy.FeedUtils.log.debug( "FeedParser.parseFeed: type:url - " + aFeed.mFeedType + " : " + aFeed.url ); return this.parseAsRSS2(aFeed, aDOM); }, parseAsRSS2(aFeed, aDOM) { // Get the first channel (assuming there is only one per RSS File). let channel = aDOM.querySelector("channel"); if (!channel) { aFeed.onParseError(aFeed); return []; } // Usually the empty string, unless this is RSS .90. let nsURI = channel.namespaceURI || ""; if (this.isPermanentRedirect(aFeed, null, channel)) { return []; } let tags = this.childrenByTagNameNS(channel, nsURI, "title"); aFeed.title = aFeed.title || this.getNodeValue(tags ? tags[0] : null); tags = this.childrenByTagNameNS(channel, nsURI, "description"); aFeed.description = this.getNodeValueFormatted(tags ? tags[0] : null); tags = this.childrenByTagNameNS(channel, nsURI, "link"); aFeed.link = this.validLink(this.getNodeValue(tags ? tags[0] : null)); if (!(aFeed.title || aFeed.description)) { lazy.FeedUtils.log.error( "FeedParser.parseAsRSS2: missing mandatory element " + " and <description>" ); // The RSS2 spec requires a <link> as well, but we can do without it // so ignore the case of (valid) link missing. aFeed.onParseError(aFeed); return []; } if (!aFeed.parseItems) { return []; } this.findSyUpdateTags(aFeed, channel); aFeed.invalidateItems(); // XXX use getElementsByTagNameNS for now; childrenByTagNameNS would be // better, but RSS .90 is still with us. let itemNodes = aDOM.getElementsByTagNameNS(nsURI, "item"); itemNodes = itemNodes ? itemNodes : []; lazy.FeedUtils.log.debug( "FeedParser.parseAsRSS2: items to parse - " + itemNodes.length ); for (let itemNode of itemNodes) { if (!itemNode.childElementCount) { continue; } let item = new lazy.FeedItem(); item.feed = aFeed; item.enclosures = []; item.keywords = []; tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.FEEDBURNER_NS, "origLink" ); let link = this.validLink(this.getNodeValue(tags ? tags[0] : null)); if (!link) { tags = this.childrenByTagNameNS(itemNode, nsURI, "link"); link = this.validLink(this.getNodeValue(tags ? tags[0] : null)); } tags = this.childrenByTagNameNS(itemNode, nsURI, "guid"); let guidNode = tags ? tags[0] : null; let guid; let isPermaLink = false; if (guidNode) { guid = this.getNodeValue(guidNode); // isPermaLink is true if the value is "true" or if the attribute is // not present; all other values, including "false" and "False" and // for that matter "TRuE" and "meatcake" are false. if ( !guidNode.hasAttribute("isPermaLink") || guidNode.getAttribute("isPermaLink") == "true" ) { isPermaLink = true; } // If attribute isPermaLink is missing, it is good to check the validity // of <guid> value as an URL to avoid linking to non-URL strings. if (!guidNode.hasAttribute("isPermaLink")) { try { Services.io.newURI(guid); if (Services.io.extractScheme(guid) == "tag") { isPermaLink = false; } } catch (ex) { isPermaLink = false; } } item.id = guid; } let guidLink = this.validLink(guid); if (isPermaLink && guidLink) { item.url = guidLink; } else if (link) { item.url = link; } else { item.url = null; } tags = this.childrenByTagNameNS(itemNode, nsURI, "description"); item.description = this.getNodeValueFormatted(tags ? tags[0] : null); tags = this.childrenByTagNameNS(itemNode, nsURI, "title"); item.title = this.getNodeValue(tags ? tags[0] : null); if (!(item.title || item.description)) { lazy.FeedUtils.log.info( "FeedParser.parseAsRSS2: <item> missing mandatory " + "element, either <title> or <description>; skipping" ); continue; } if (!item.id) { // At this point, if there is no guid, uniqueness cannot be guaranteed // by any of link or date (optional) or title (optional unless there // is no description). Use a big chunk of description; minimize dupes // with url and title if present. item.id = (item.url || item.feed.url) + "#" + item.title + "#" + (this.stripTags( item.description ? item.description.substr(0, 150) : null ) || item.title); item.id = item.id.replace(/[\n\r\t\s]+/g, " "); } // Escape html entities in <title>, which are unescaped as textContent // values. If the title is used as content, it will remain escaped; if // it is used as the title, it will be unescaped upon store. Bug 1240603. // The <description> tag must follow escaping examples found in // http://www.rssboard.org/rss-encoding-examples, i.e. single escape angle // brackets for tags, which are removed if used as title, and double // escape entities for presentation in title. // Better: always use <title>. Best: use Atom. if (!item.title) { item.title = this.stripTags(item.description).substr(0, 150); } else { item.title = item.htmlEscape(item.title); } tags = this.childrenByTagNameNS(itemNode, nsURI, "author"); if (!tags) { tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.DC_NS, "creator" ); } let author = this.getNodeValue(tags ? tags[0] : null) || aFeed.title; author = this.cleanAuthorName(author); item.author = author ? ["<" + author + ">"] : item.author; tags = this.childrenByTagNameNS(itemNode, nsURI, "pubDate"); if (!tags || !this.getNodeValue(tags[0])) { tags = this.childrenByTagNameNS(itemNode, lazy.FeedUtils.DC_NS, "date"); } item.date = this.getNodeValue(tags ? tags[0] : null) || item.date; // If the date is invalid, users will see the beginning of the epoch // unless we reset it here, so they'll see the current time instead. // This is typical aggregator behavior. if (item.date) { item.date = item.date.trim(); if (!lazy.FeedUtils.isValidRFC822Date(item.date)) { // XXX Use this on the other formats as well. item.date = this.dateRescue(item.date); } } tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.RSS_CONTENT_NS, "encoded" ); item.content = this.getNodeValueFormatted(tags ? tags[0] : null); // Handle <enclosures> and <media:content>, which may be in a // <media:group> (if present). tags = this.childrenByTagNameNS(itemNode, nsURI, "enclosure"); let encUrls = []; if (tags) { for (let tag of tags) { let url = this.validLink(tag.getAttribute("url")); if (url && !encUrls.includes(url)) { let type = this.removeUnprintableASCII(tag.getAttribute("type")); let length = this.removeUnprintableASCII( tag.getAttribute("length") ); item.enclosures.push(new lazy.FeedEnclosure(url, type, length)); encUrls.push(url); } } } tags = itemNode.getElementsByTagNameNS(lazy.FeedUtils.MRSS_NS, "content"); if (tags) { for (let tag of tags) { let url = this.validLink(tag.getAttribute("url")); if (url && !encUrls.includes(url)) { let type = this.removeUnprintableASCII(tag.getAttribute("type")); let fileSize = this.removeUnprintableASCII( tag.getAttribute("fileSize") ); item.enclosures.push(new lazy.FeedEnclosure(url, type, fileSize)); } } } // The <origEnclosureLink> tag has no specification, especially regarding // whether more than one tag is allowed and, if so, how tags would // relate to previously declared (and well specified) enclosure urls. // The common usage is to include 1 origEnclosureLink, in addition to // the specified enclosure tags for 1 enclosure. Thus, we will replace the // first enclosure's, if found, url with the first <origEnclosureLink> // url only or else add the <origEnclosureLink> url. tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.FEEDBURNER_NS, "origEnclosureLink" ); let origEncUrl = this.validLink(this.getNodeValue(tags ? tags[0] : null)); if (origEncUrl) { if (item.enclosures.length) { item.enclosures[0].mURL = origEncUrl; } else { item.enclosures.push(new lazy.FeedEnclosure(origEncUrl)); } } // Support <category> and autotagging. tags = this.childrenByTagNameNS(itemNode, nsURI, "category"); if (tags) { for (let tag of tags) { let term = this.getNodeValue(tag); term = term ? this.xmlUnescape(term.replace(/,/g, ";")) : null; if (term && !item.keywords.includes(term)) { item.keywords.push(term); } } } this.parsedItems.push(item); } return this.parsedItems; }, /** * Extracts feed details and (optionally) items from an RSS1 * feed which has already been XML-parsed as an XMLDocument. * The feed items are extracted only if feed.parseItems is set. * * Technically RSS1 is supposed to be treated as RDFXML, but in practice * no feed parser anywhere ever does this, and feeds in the wild are * pretty shakey on their RDF encoding too. So we just treat it as raw * XML and pick out the bits we want. * * @param {Feed} feed - The Feed object. * @param {XMLDocument} doc - The document to parse. * @returns {Array} - array of FeedItems or empty array for error returns or * nothing to do condition (ie unset feed.parseItems). */ parseAsRSS1(feed, doc) { let channel = doc.querySelector("channel"); if (!channel) { feed.onParseError(feed); return []; } if (this.isPermanentRedirect(feed, null, channel)) { return []; } let titleNode = this.childByTagNameNS( channel, lazy.FeedUtils.RSS_NS, "title" ); // If user entered a title manually, retain it. feed.title = feed.title || this.getNodeValue(titleNode) || feed.url; let descNode = this.childByTagNameNS( channel, lazy.FeedUtils.RSS_NS, "description" ); feed.description = this.getNodeValueFormatted(descNode) || ""; let linkNode = this.childByTagNameNS( channel, lazy.FeedUtils.RSS_NS, "link" ); feed.link = this.validLink(this.getNodeValue(linkNode)) || feed.url; if (!(feed.title || feed.description) || !feed.link) { lazy.FeedUtils.log.error( "FeedParser.parseAsRSS1: missing mandatory element " + "<title> and <description>, or <link>" ); feed.onParseError(feed); return []; } // If we're only interested in the overall feed description, we're done. if (!feed.parseItems) { return []; } this.findSyUpdateTags(feed, channel); feed.invalidateItems(); // Now process all the individual items in the feed. let itemNodes = doc.getElementsByTagNameNS(lazy.FeedUtils.RSS_NS, "item"); itemNodes = itemNodes ? itemNodes : []; for (let itemNode of itemNodes) { let item = new lazy.FeedItem(); item.feed = feed; // Prefer the value of the link tag to the item URI since the URI could be // a relative URN. let itemURI = itemNode.getAttribute("about") || ""; itemURI = this.removeUnprintableASCII(itemURI.trim()); let linkNode = this.childByTagNameNS( itemNode, lazy.FeedUtils.RSS_NS, "link" ); item.id = this.getNodeValue(linkNode) || itemURI; item.url = this.validLink(item.id); let descNode = this.childByTagNameNS( itemNode, lazy.FeedUtils.RSS_NS, "description" ); item.description = this.getNodeValueFormatted(descNode); let titleNode = this.childByTagNameNS( itemNode, lazy.FeedUtils.RSS_NS, "title" ); let subjectNode = this.childByTagNameNS( itemNode, lazy.FeedUtils.DC_NS, "subject" ); item.title = this.getNodeValue(titleNode) || this.getNodeValue(subjectNode); if (!item.title && item.description) { item.title = this.stripTags(item.description).substr(0, 150); } if (!item.url || !item.title) { lazy.FeedUtils.log.info( "FeedParser.parseAsRSS1: <item> missing mandatory " + "element <item rdf:about> and <link>, or <title> and " + "no <description>; skipping" ); continue; } // TODO XXX: ignores multiple authors. let authorNode = this.childByTagNameNS( itemNode, lazy.FeedUtils.DC_NS, "creator" ); let channelCreatorNode = this.childByTagNameNS( channel, lazy.FeedUtils.DC_NS, "creator" ); let author = this.getNodeValue(authorNode) || this.getNodeValue(channelCreatorNode) || feed.title; author = this.cleanAuthorName(author); item.author = author ? ["<" + author + ">"] : item.author; let dateNode = this.childByTagNameNS( itemNode, lazy.FeedUtils.DC_NS, "date" ); item.date = this.getNodeValue(dateNode) || item.date; let contentNode = this.childByTagNameNS( itemNode, lazy.FeedUtils.RSS_CONTENT_NS, "encoded" ); item.content = this.getNodeValueFormatted(contentNode); this.parsedItems.push(item); } lazy.FeedUtils.log.debug( "FeedParser.parseAsRSS1: items parsed - " + this.parsedItems.length ); return this.parsedItems; }, // TODO: deprecate ATOM_03_NS. parseAsAtom(aFeed, aDOM) { // Get the first channel (assuming there is only one per Atom File). let channel = aDOM.querySelector("feed"); if (!channel) { aFeed.onParseError(aFeed); return []; } if (this.isPermanentRedirect(aFeed, null, channel)) { return []; } let tags = this.childrenByTagNameNS( channel, lazy.FeedUtils.ATOM_03_NS, "title" ); aFeed.title = aFeed.title || this.stripTags(this.getNodeValue(tags ? tags[0] : null)); tags = this.childrenByTagNameNS( channel, lazy.FeedUtils.ATOM_03_NS, "tagline" ); aFeed.description = this.getNodeValueFormatted(tags ? tags[0] : null); tags = this.childrenByTagNameNS(channel, lazy.FeedUtils.ATOM_03_NS, "link"); aFeed.link = this.validLink(this.findAtomLink("alternate", tags)); if (!aFeed.title) { lazy.FeedUtils.log.error( "FeedParser.parseAsAtom: missing mandatory element <title>" ); aFeed.onParseError(aFeed); return []; } if (!aFeed.parseItems) { return []; } this.findSyUpdateTags(aFeed, channel); aFeed.invalidateItems(); let items = this.childrenByTagNameNS( channel, lazy.FeedUtils.ATOM_03_NS, "entry" ); items = items ? items : []; lazy.FeedUtils.log.debug( "FeedParser.parseAsAtom: items to parse - " + items.length ); for (let itemNode of items) { if (!itemNode.childElementCount) { continue; } let item = new lazy.FeedItem(); item.feed = aFeed; tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_03_NS, "link" ); item.url = this.validLink(this.findAtomLink("alternate", tags)); tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_03_NS, "id" ); item.id = this.getNodeValue(tags ? tags[0] : null); tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_03_NS, "summary" ); item.description = this.getNodeValueFormatted(tags ? tags[0] : null); tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_03_NS, "title" ); item.title = this.getNodeValue(tags ? tags[0] : null) || (item.description ? item.description.substr(0, 150) : null); if (!item.title || !item.id) { // We're lenient about other mandatory tags, but insist on these. lazy.FeedUtils.log.info( "FeedParser.parseAsAtom: <entry> missing mandatory " + "element <id>, or <title> and no <summary>; skipping" ); continue; } tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_03_NS, "author" ); if (!tags) { tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_03_NS, "contributor" ); } if (!tags) { tags = this.childrenByTagNameNS( channel, lazy.FeedUtils.ATOM_03_NS, "author" ); } let authorEl = tags ? tags[0] : null; let author = ""; if (authorEl) { tags = this.childrenByTagNameNS( authorEl, lazy.FeedUtils.ATOM_03_NS, "name" ); let name = this.getNodeValue(tags ? tags[0] : null); tags = this.childrenByTagNameNS( authorEl, lazy.FeedUtils.ATOM_03_NS, "email" ); let email = this.getNodeValue(tags ? tags[0] : null); if (name) { author = name + (email ? " <" + email + ">" : ""); } else if (email) { author = email; } } item.author = author || item.author || aFeed.title; tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_03_NS, "modified" ); if (!tags || !this.getNodeValue(tags[0])) { tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_03_NS, "issued" ); } if (!tags || !this.getNodeValue(tags[0])) { tags = this.childrenByTagNameNS( channel, lazy.FeedUtils.ATOM_03_NS, "created" ); } item.date = this.getNodeValue(tags ? tags[0] : null) || item.date; // XXX We should get the xml:base attribute from the content tag as well // and use it as the base HREF of the message. // XXX Atom feeds can have multiple content elements; we should differentiate // between them and pick the best one. // Some Atom feeds wrap the content in a CTYPE declaration; others use // a namespace to identify the tags as HTML; and a few are buggy and put // HTML tags in without declaring their namespace so they look like Atom. // We deal with the first two but not the third. tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_03_NS, "content" ); let contentNode = tags ? tags[0] : null; let content; if (contentNode) { content = ""; for (let node of contentNode.childNodes) { if (node.nodeType == node.CDATA_SECTION_NODE) { content += node.data; } else { content += this.mSerializer.serializeToString(node); } } if (contentNode.getAttribute("mode") == "escaped") { content = content.replace(/</g, "<"); content = content.replace(/>/g, ">"); content = content.replace(/&/g, "&"); } if (content == "") { content = null; } } item.content = content; this.parsedItems.push(item); } return this.parsedItems; }, parseAsAtomIETF(aFeed, aDOM) { // Get the first channel (assuming there is only one per Atom File). let channel = this.childrenByTagNameNS( aDOM, lazy.FeedUtils.ATOM_IETF_NS, "feed" )[0]; if (!channel) { aFeed.onParseError(aFeed); return []; } if (this.isPermanentRedirect(aFeed, null, channel)) { return []; } let contentBase = channel.getAttribute("xml:base"); let tags = this.childrenByTagNameNS( channel, lazy.FeedUtils.ATOM_IETF_NS, "title" ); aFeed.title = aFeed.title || this.stripTags(this.serializeTextConstruct(tags ? tags[0] : null)); tags = this.childrenByTagNameNS( channel, lazy.FeedUtils.ATOM_IETF_NS, "subtitle" ); aFeed.description = this.serializeTextConstruct(tags ? tags[0] : null); // Per spec, aFeed.link and contentBase may both end up null here. tags = this.childrenByTagNameNS( channel, lazy.FeedUtils.ATOM_IETF_NS, "link" ); aFeed.link = this.findAtomLink("self", tags, contentBase) || this.findAtomLink("alternate", tags, contentBase); aFeed.link = this.validLink(aFeed.link); if (!contentBase) { contentBase = aFeed.link; } if (!aFeed.title) { lazy.FeedUtils.log.error( "FeedParser.parseAsAtomIETF: missing mandatory element <title>" ); aFeed.onParseError(aFeed); return []; } if (!aFeed.parseItems) { return []; } this.findSyUpdateTags(aFeed, channel); aFeed.invalidateItems(); let items = this.childrenByTagNameNS( channel, lazy.FeedUtils.ATOM_IETF_NS, "entry" ); items = items ? items : []; lazy.FeedUtils.log.debug( "FeedParser.parseAsAtomIETF: items to parse - " + items.length ); for (let itemNode of items) { if (!itemNode.childElementCount) { continue; } let item = new lazy.FeedItem(); item.feed = aFeed; item.enclosures = []; item.keywords = []; contentBase = itemNode.getAttribute("xml:base") || contentBase; tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_IETF_NS, "source" ); let source = tags ? tags[0] : null; // Per spec, item.link and contentBase may both end up null here. // If <content> is also not present, then <link rel="alternate"> is MUST // but we're lenient. tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.FEEDBURNER_NS, "origLink" ); item.url = this.validLink(this.getNodeValue(tags ? tags[0] : null)); if (!item.url) { tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_IETF_NS, "link" ); item.url = this.validLink(this.findAtomLink("alternate", tags, contentBase)) || aFeed.link; } if (!contentBase) { contentBase = item.url; } tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_IETF_NS, "id" ); item.id = this.getNodeValue(tags ? tags[0] : null); tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_IETF_NS, "summary" ); item.description = this.serializeTextConstruct(tags ? tags[0] : null); tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_IETF_NS, "title" ); if (!tags || !this.getNodeValue(tags[0])) { tags = this.childrenByTagNameNS( source, lazy.FeedUtils.ATOM_IETF_NS, "title" ); } item.title = this.stripTags( this.serializeTextConstruct(tags ? tags[0] : null) || (item.description ? item.description.substr(0, 150) : null) ); if (!item.title || !item.id) { // We're lenient about other mandatory tags, but insist on these. lazy.FeedUtils.log.info( "FeedParser.parseAsAtomIETF: <entry> missing mandatory " + "element <id>, or <title> and no <summary>; skipping" ); continue; } // Support multiple authors. tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_IETF_NS, "author" ); if (!tags) { tags = this.childrenByTagNameNS( source, lazy.FeedUtils.ATOM_IETF_NS, "author" ); } if (!tags) { tags = this.childrenByTagNameNS( channel, lazy.FeedUtils.ATOM_IETF_NS, "author" ); } let authorTags = tags || []; let authors = []; for (let authorTag of authorTags) { let author = ""; tags = this.childrenByTagNameNS( authorTag, lazy.FeedUtils.ATOM_IETF_NS, "name" ); let name = this.getNodeValue(tags ? tags[0] : null); tags = this.childrenByTagNameNS( authorTag, lazy.FeedUtils.ATOM_IETF_NS, "email" ); let email = this.getNodeValue(tags ? tags[0] : null); if (name) { name = this.cleanAuthorName(name); if (email) { if (!email.match(/^<.*>$/)) { email = " <" + email + ">"; } author = name + email; } else { author = "<" + name + ">"; } } else if (email) { author = email; } if (author) { authors.push(author); } } if (authors.length == 0) { tags = this.childrenByTagNameNS( channel, lazy.FeedUtils.DC_NS, "publisher" ); let author = this.getNodeValue(tags ? tags[0] : null) || aFeed.title; author = this.cleanAuthorName(author); item.author = author ? ["<" + author + ">"] : item.author; } else { item.author = authors; } lazy.FeedUtils.log.trace( "FeedParser.parseAsAtomIETF: author(s) - " + item.author ); tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_IETF_NS, "updated" ); if (!tags || !this.getNodeValue(tags[0])) { tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_IETF_NS, "published" ); } if (!tags || !this.getNodeValue(tags[0])) { tags = this.childrenByTagNameNS( source, lazy.FeedUtils.ATOM_IETF_NS, "published" ); } item.date = this.getNodeValue(tags ? tags[0] : null) || item.date; tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_IETF_NS, "content" ); item.content = this.serializeTextConstruct(tags ? tags[0] : null); // Ensure relative links can be resolved and Content-Base set to an // absolute url for the entry. But it's not mandatory that a url is found // for Content-Base, per spec. if (item.content) { item.xmlContentBase = (tags && tags[0].getAttribute("xml:base")) || contentBase; } else if (item.description) { tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_IETF_NS, "summary" ); item.xmlContentBase = (tags && tags[0].getAttribute("xml:base")) || contentBase; } else { item.xmlContentBase = contentBase; } item.xmlContentBase = this.validLink(item.xmlContentBase); // Handle <link rel="enclosure"> (if present). tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_IETF_NS, "link" ); let encUrls = []; if (tags) { for (let tag of tags) { let url = tag.getAttribute("rel") == "enclosure" ? (tag.getAttribute("href") || "").trim() : null; url = this.validLink(url); if (url && !encUrls.includes(url)) { let type = this.removeUnprintableASCII(tag.getAttribute("type")); let length = this.removeUnprintableASCII( tag.getAttribute("length") ); let title = this.removeUnprintableASCII(tag.getAttribute("title")); item.enclosures.push( new lazy.FeedEnclosure(url, type, length, title) ); encUrls.push(url); } } } tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.FEEDBURNER_NS, "origEnclosureLink" ); let origEncUrl = this.validLink(this.getNodeValue(tags ? tags[0] : null)); if (origEncUrl) { if (item.enclosures.length) { item.enclosures[0].mURL = origEncUrl; } else { item.enclosures.push(new lazy.FeedEnclosure(origEncUrl)); } } // Handle atom threading extension, RFC4685. There may be 1 or more tags, // and each must contain a ref attribute with 1 Message-Id equivalent // value. This is the only attr of interest in the spec for presentation. tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_THREAD_NS, "in-reply-to" ); if (tags) { for (let tag of tags) { let ref = this.removeUnprintableASCII(tag.getAttribute("ref")); if (ref) { item.inReplyTo += item.normalizeMessageID(ref) + " "; } } item.inReplyTo = item.inReplyTo.trimRight(); } // Support <category> and autotagging. tags = this.childrenByTagNameNS( itemNode, lazy.FeedUtils.ATOM_IETF_NS, "category" ); if (tags) { for (let tag of tags) { let term = this.removeUnprintableASCII(tag.getAttribute("term")); term = term ? this.xmlUnescape(term.replace(/,/g, ";")).trim() : null; if (term && !item.keywords.includes(term)) { item.keywords.push(term); } } } this.parsedItems.push(item); } return this.parsedItems; }, isPermanentRedirect(aFeed, aRedirDocChannel, aFeedChannel) { // If subscribing to a new feed, do not check redirect tags. if (!aFeed.downloadCallback || aFeed.downloadCallback.mSubscribeMode) { return false; } let tags, tagName, newUrl; let oldUrl = aFeed.url; // Check for RSS2.0 redirect document <newLocation> tag. if (aRedirDocChannel) { tagName = "newLocation"; tags = this.childrenByTagNameNS(aRedirDocChannel, "", tagName); newUrl = this.getNodeValue(tags ? tags[0] : null); } // Check for <itunes:new-feed-url> tag. if (aFeedChannel) { tagName = "new-feed-url"; tags = this.childrenByTagNameNS( aFeedChannel, lazy.FeedUtils.ITUNES_NS, tagName ); newUrl = this.getNodeValue(tags ? tags[0] : null); tagName = "itunes:" + tagName; } if ( newUrl && newUrl != oldUrl && lazy.FeedUtils.isValidScheme(newUrl) && lazy.FeedUtils.changeUrlForFeed(aFeed, newUrl) ) { lazy.FeedUtils.log.info( "FeedParser.isPermanentRedirect: found <" + tagName + "> tag; updated feed url from: " + oldUrl + " to: " + newUrl + " in folder: " + lazy.FeedUtils.getFolderPrettyPath(aFeed.folder) ); aFeed.onUrlChange(aFeed, oldUrl); return true; } return false; }, serializeTextConstruct(textElement) { let content = ""; if (textElement) { let textType = textElement.getAttribute("type"); // Atom spec says consider it "text" if not present. if (!textType) { textType = "text"; } // There could be some strange content type we don't handle. if (textType != "text" && textType != "html" && textType != "xhtml") { return null; } for (let node of textElement.childNodes) { if (node.nodeType == node.CDATA_SECTION_NODE) { content += this.xmlEscape(node.data); } else { content += this.mSerializer.serializeToString(node); } } if (textType == "html") { content = this.xmlUnescape(content); } content = content.trim(); } // Other parts of the code depend on this being null if there's no content. return content ? content : null; }, /** * Return a cleaned up author name value. * * @param {string} authorString - A string. * @returns {String} - A clean string value. */ cleanAuthorName(authorString) { if (!authorString) { return ""; } lazy.FeedUtils.log.trace( "FeedParser.cleanAuthor: author1 - " + authorString ); let author = authorString .replace(/[\n\r\t]+/g, " ") .replace(/"/g, '\\"') .trim(); // If the name contains special chars, quote it. if (author.match(/[<>@,"]/)) { author = '"' + author + '"'; } lazy.FeedUtils.log.trace("FeedParser.cleanAuthor: author2 - " + author); return author; }, /** * Return a cleaned up node value. This is intended for values that are not * multiline and not formatted. A sequence of tab or newline is converted to * a space and unprintable ascii is removed. * * @param {Node} node - A DOM node. * @returns {String} - A clean string value or null. */ getNodeValue(node) { let nodeValue = this.getNodeValueRaw(node); if (!nodeValue) { return null; } nodeValue = nodeValue.replace(/[\n\r\t]+/g, " "); return this.removeUnprintableASCII(nodeValue); }, /** * Return a cleaned up formatted node value, meaning CR/LF/TAB are retained * while all other unprintable ascii is removed. This is intended for values * that are multiline and formatted, such as content or description tags. * * @param {Node} node - A DOM node. * @returns {String} - A clean string value or null. */ getNodeValueFormatted(node) { let nodeValue = this.getNodeValueRaw(node); if (!nodeValue) { return null; } return this.removeUnprintableASCIIexCRLFTAB(nodeValue); }, /** * Return a raw node value, as received. This should be sanitized as * appropriate. * * @param {Node} node - A DOM node. * @returns {String} - A string value or null. */ getNodeValueRaw(node) { if (node && node.textContent) { return node.textContent.trim(); } if (node && node.firstChild) { let ret = ""; for (let child = node.firstChild; child; child = child.nextSibling) { let value = this.getNodeValueRaw(child); if (value) { ret += value; } } if (ret) { return ret.trim(); } } return null; }, // Finds elements that are direct children of the first arg. childrenByTagNameNS(aElement, aNamespace, aTagName) { if (!aElement) { return null; } let matches = aElement.getElementsByTagNameNS(aNamespace, aTagName); let matchingChildren = []; for (let match of matches) { if (match.parentNode == aElement) { matchingChildren.push(match); } } return matchingChildren.length ? matchingChildren : null; }, /** * Returns first matching descendent of element, or null. * * @param {Element} element - DOM element to search. * @param {string} namespace - Namespace of the search tag. * @param {String} tagName - Tag to search for. * @returns {Element|null} - Matching element, or null. */ childByTagNameNS(element, namespace, tagName) { if (!element) { return null; } // Handily, item() returns null for out-of-bounds access. return element.getElementsByTagNameNS(namespace, tagName).item(0); }, /** * Ensure <link> type tags start with http[s]://, ftp:// or magnet: * for values stored in mail headers (content-base and remote enclosures), * particularly to prevent data: uris, javascript, and other spoofing. * * @param {string} link - An intended http url string. * @returns {String} - A clean string starting with http, ftp or magnet, * else null. */ validLink(link) { if (/^((https?|ftp):\/\/|magnet:)/.test(link)) { return this.removeUnprintableASCII(link.trim()); } return null; }, /** * Return an absolute link for <entry> relative links. If xml:base is * present in a <feed> attribute or child <link> element attribute, use it; * otherwise the Feed.link will be the relevant <feed> child <link> value * and will be the |baseURI| for <entry> child <link>s if there is no further * xml:base, which may be an attribute of any element. * * @param {string} linkRel - the <link> rel attribute value to find. * @param {NodeList} linkElements - the nodelist of <links> to search in. * @param {string} baseURI - the url to use when resolving relative * links to absolute values. * @returns {String} or null - absolute url for a <link>, or null if the * rel type is not found. */ findAtomLink(linkRel, linkElements, baseURI) { if (!linkElements) { return null; } // XXX Need to check for MIME type and hreflang. for (let alink of linkElements) { if ( alink && // If there's a link rel. ((alink.getAttribute("rel") && alink.getAttribute("rel") == linkRel) || // If there isn't, assume 'alternate'. (!alink.getAttribute("rel") && linkRel == "alternate")) && alink.getAttribute("href") ) { // Atom links are interpreted relative to xml:base. let href = alink.getAttribute("href"); baseURI = alink.getAttribute("xml:base") || baseURI || href; try { return Services.io.newURI(baseURI).resolve(href); } catch (ex) {} } } return null; }, /** * Find RSS Syndication extension tags. * http://web.resource.org/rss/1.0/modules/syndication/ * * @param {Feed} aFeed - the feed object. * @param {Node | String} aChannel - dom node for the <channel>. * @returns {void} */ findSyUpdateTags(aFeed, aChannel) { let tag, updatePeriod, updateFrequency, updateBase; tag = this.childrenByTagNameNS( aChannel, lazy.FeedUtils.RSS_SY_NS, "updatePeriod" ); updatePeriod = this.getNodeValue(tag ? tag[0] : null) || ""; tag = this.childrenByTagNameNS( aChannel, lazy.FeedUtils.RSS_SY_NS, "updateFrequency" ); updateFrequency = this.getNodeValue(tag ? tag[0] : null) || ""; tag = this.childrenByTagNameNS( aChannel, lazy.FeedUtils.RSS_SY_NS, "updateBase" ); updateBase = this.getNodeValue(tag ? tag[0] : null) || ""; lazy.FeedUtils.log.debug( "FeedParser.findSyUpdateTags: updatePeriod:updateFrequency - " + updatePeriod + ":" + updateFrequency ); if (updatePeriod) { if (lazy.FeedUtils.RSS_SY_UNITS.includes(updatePeriod.toLowerCase())) { updatePeriod = updatePeriod.toLowerCase(); } else { updatePeriod = "daily"; } } updateFrequency = isNaN(updateFrequency) ? 1 : updateFrequency; let options = aFeed.options; if ( options.updates.updatePeriod == updatePeriod && options.updates.updateFrequency == updateFrequency && options.updates.updateBase == updateBase ) { return; } options.updates.updatePeriod = updatePeriod; options.updates.updateFrequency = updateFrequency; options.updates.updateBase = updateBase; aFeed.options = options; }, /** * Remove unprintable ascii, particularly CR/LF, for non formatted tag values. * * @param {string} s - String to clean. * @returns {String} - Cleaned string. */ removeUnprintableASCII(s) { /* eslint-disable-next-line no-control-regex */ return s ? s.replace(/[\x00-\x1F\x7F]+/g, "") : ""; }, /** * Remove unprintable ascii, except CR/LF/TAB, for formatted tag values. * * @param {string} s - String to clean. * @returns {String} - Cleaned string. */ removeUnprintableASCIIexCRLFTAB(s) { /* eslint-disable-next-line no-control-regex */ return s ? s.replace(/[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]+/g, "") : ""; }, stripTags(someHTML) { return someHTML ? someHTML.replace(/<[^>]+>/g, "") : someHTML; }, xmlUnescape(s) { s = s.replace(/</g, "<"); s = s.replace(/>/g, ">"); s = s.replace(/&/g, "&"); return s; }, xmlEscape(s) { s = s.replace(/&/g, "&"); s = s.replace(/>/g, ">"); s = s.replace(/</g, "<"); return s; }, dateRescue(dateString) { // Deal with various kinds of invalid dates. if (!isNaN(parseInt(dateString))) { // It's an integer, so maybe it's a timestamp. let d = new Date(parseInt(dateString) * 1000); let now = new Date(); let yeardiff = now.getFullYear() - d.getFullYear(); lazy.FeedUtils.log.trace( "FeedParser.dateRescue: Rescue Timestamp date - " + d.toString() + " ,year diff - " + yeardiff ); if (yeardiff >= 0 && yeardiff < 3) { // It's quite likely the correct date. return d.toString(); } } // Could be an ISO8601/W3C date. If not, get the current time. return lazy.FeedUtils.getValidRFC5322Date(dateString); }, };