diff options
Diffstat (limited to '')
-rw-r--r-- | browser/components/pagedata/SchemaOrgPageData.sys.mjs | 441 |
1 files changed, 441 insertions, 0 deletions
diff --git a/browser/components/pagedata/SchemaOrgPageData.sys.mjs b/browser/components/pagedata/SchemaOrgPageData.sys.mjs new file mode 100644 index 0000000000..449572c76f --- /dev/null +++ b/browser/components/pagedata/SchemaOrgPageData.sys.mjs @@ -0,0 +1,441 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +import { PageDataSchema } from "resource:///modules/pagedata/PageDataSchema.sys.mjs"; + +/** + * Represents an item from the schema.org specification. + * + * Every `Item` has a type and a set of properties. Each property has a string + * name and a list of values. It often isn't clear from the spec whether a + * property is expected to have a list of values or just one value so this + * data structure stores every property as a list and provides a simple method + * to get the first property value. + */ +class Item { + /** @type {string} The type of the item e.g. "Product" or "Person". */ + type; + + /** @type {Map<string, any[]>} Properties of the item. */ + properties = new Map(); + + /** + * Constructors a new `Item` of the given type. + * + * @param {string} type + * The type of the item. + */ + constructor(type) { + this.type = type; + } + + /** + * Tests whether a property has any values in this item. + * + * @param {string} prop + * The name of the property. + * @returns {boolean} + */ + has(prop) { + return this.properties.has(prop); + } + + /** + * Gets all of the values for a property. This may return an empty array if + * there are no values. + * + * @param {string} prop + * The name of the property. + * @returns {any[]} + */ + all(prop) { + return this.properties.get(prop) ?? []; + } + + /** + * Gets the first value for a property. + * + * @param {string} prop + * The name of the property. + * @returns {any} + */ + get(prop) { + return this.properties.get(prop)?.[0]; + } + + /** + * Sets a value for a property. + * + * @param {string} prop + * The name of the property. + * @param {any} value + * The value of the property. + */ + set(prop, value) { + let props = this.properties.get(prop); + if (props === undefined) { + props = []; + this.properties.set(prop, props); + } + + props.push(value); + } + + /** + * Converts this item to JSON-LD. + * + * Single array properties are converted into simple properties. + * + * @returns {object} + */ + toJsonLD() { + /** + * Converts a value to its JSON-LD representation. + * + * @param {any} val + * The value to convert. + * @returns {any} + */ + function toLD(val) { + if (val instanceof Item) { + return val.toJsonLD(); + } + return val; + } + + let props = Array.from(this.properties, ([key, value]) => { + if (value.length == 1) { + return [key, toLD(value[0])]; + } + + return [key, value.map(toLD)]; + }); + + return { + "@type": this.type, + ...Object.fromEntries(props), + }; + } +} + +/** + * Parses the value for a given microdata property. + * See https://html.spec.whatwg.org/multipage/microdata.html#values for the parsing spec + * + * @param {Element} propElement + * The property element. + * @returns {any} + * The value of the property. + */ +function parseMicrodataProp(propElement) { + if (propElement.hasAttribute("itemscope")) { + throw new Error( + "Cannot parse a simple property value from an itemscope element." + ); + } + + const parseUrl = (urlElement, attr) => { + if (!urlElement.hasAttribute(attr)) { + return ""; + } + + try { + let url = new URL( + urlElement.getAttribute(attr), + urlElement.ownerDocument.documentURI + ); + return url.toString(); + } catch (e) { + return ""; + } + }; + + switch (propElement.localName) { + case "meta": + return propElement.getAttribute("content") ?? ""; + case "audio": + case "embed": + case "iframe": + case "source": + case "track": + case "video": + return parseUrl(propElement, "src"); + case "img": + // Some pages may be using a lazy loading approach to images, putting a + // temporary image in "src" while the real image is in a differently + // named attribute. So far we found "content" and "data-src" are common + // names for that attribute. + return ( + parseUrl(propElement, "content") || + parseUrl(propElement, "data-src") || + parseUrl(propElement, "src") + ); + case "object": + return parseUrl(propElement, "data"); + case "a": + case "area": + case "link": + return parseUrl(propElement, "href"); + case "data": + case "meter": + return propElement.getAttribute("value"); + case "time": + if (propElement.hasAtribute("datetime")) { + return propElement.getAttribute("datetime"); + } + return propElement.textContent; + default: + // Not mentioned in the spec but sites seem to use it. + if (propElement.hasAttribute("content")) { + return propElement.getAttribute("content"); + } + return propElement.textContent; + } +} + +/** + * Collects product data from an item. + * + * @param {Document} document + * The document the item comes from. + * @param {PageData} pageData + * The pageData object to add to. + * @param {Item} item + * The product item. + */ +function collectProduct(document, pageData, item) { + if (item.has("image")) { + let url = new URL(item.get("image"), document.documentURI); + pageData.image = url.toString(); + } + + if (item.has("description")) { + pageData.description = item.get("description"); + } + + pageData.data[PageDataSchema.DATA_TYPE.PRODUCT] = { + name: item.get("name"), + }; + + for (let offer of item.all("offers")) { + if (!(offer instanceof Item) || offer.type != "Offer") { + continue; + } + + let price = parseFloat(offer.get("price")); + if (!isNaN(price)) { + pageData.data[PageDataSchema.DATA_TYPE.PRODUCT].price = { + value: price, + currency: offer.get("priceCurrency"), + }; + + break; + } + } +} + +/** + * Returns the root microdata items from the given document. + * + * @param {Document} document + * The DOM document to collect from. + * @returns {Item[]} + */ +function collectMicrodataItems(document) { + // First find all of the items in the document. + let itemElements = document.querySelectorAll( + "[itemscope][itemtype^='https://schema.org/'], [itemscope][itemtype^='http://schema.org/']" + ); + + /** + * Maps elements to the closest item. + * + * @type {Map<Element, Item>} + */ + let items = new Map(); + + /** + * Finds the item for an element. Throws if there is no item. Caches the + * result. + * + * @param {Element} element + * The element to search from. + * @returns {Item} + */ + function itemFor(element) { + let item = items.get(element); + if (item) { + return item; + } + + if (!element.parentElement) { + throw new Error("Element has no parent item."); + } + + item = itemFor(element.parentElement); + items.set(element, item); + return item; + } + + for (let element of itemElements) { + let itemType = element.getAttribute("itemtype"); + // Strip off the base url + if (itemType.startsWith("https://")) { + itemType = itemType.substring(19); + } else { + itemType = itemType.substring(18); + } + + items.set(element, new Item(itemType)); + } + + // The initial roots are just all the items. + let roots = new Set(items.values()); + + // Now find all item properties. + let itemProps = document.querySelectorAll( + "[itemscope][itemtype^='https://schema.org/'] [itemprop], [itemscope][itemtype^='http://schema.org/'] [itemprop]" + ); + + for (let element of itemProps) { + // The item is always defined above the current element. + let item = itemFor(element.parentElement); + + // The properties value is either a nested item or a simple value. + let propValue = items.get(element) ?? parseMicrodataProp(element); + item.set(element.getAttribute("itemprop"), propValue); + + if (propValue instanceof Item) { + // This item belongs to another item and so is not a root item. + roots.delete(propValue); + } + } + + return [...roots]; +} + +/** + * Returns the root JSON-LD items from the given document. + * + * @param {Document} document + * The DOM document to collect from. + * @returns {Item[]} + */ +function collectJsonLDItems(document) { + /** + * The root items. + * + * @type {Item[]} + */ + let items = []; + + /** + * Converts a JSON-LD value into an Item if appropriate. + * + * @param {any} val + * The value to convert. + * @returns {any} + */ + function fromLD(val) { + if (typeof val == "object" && "@type" in val) { + let item = new Item(val["@type"]); + + for (let [prop, value] of Object.entries(val)) { + // Ignore meta properties. + if (prop.startsWith("@")) { + continue; + } + + if (!Array.isArray(value)) { + value = [value]; + } + + item.properties.set(prop, value.map(fromLD)); + } + + return item; + } + + return val; + } + + let scripts = document.querySelectorAll("script[type='application/ld+json'"); + for (let script of scripts) { + try { + let content = JSON.parse(script.textContent); + + if (typeof content != "object") { + continue; + } + + if (!("@context" in content)) { + continue; + } + + if ( + content["@context"] != "http://schema.org" && + content["@context"] != "https://schema.org" + ) { + continue; + } + + let item = fromLD(content); + if (item instanceof Item) { + items.push(item); + } + } catch (e) { + // Unparsable content. + } + } + + return items; +} + +/** + * Collects schema.org related data from a page. + * + * Currently only supports HTML Microdata and JSON-LD formats, not RDFa. + */ +export const SchemaOrgPageData = { + /** + * Parses and collects the schema.org items from the given document. + * The returned items are the roots, i.e. the top-level items, there may be + * other items as nested properties. + * + * @param {Document} document + * The DOM document to parse. + * @returns {Item[]} + */ + collectItems(document) { + return collectMicrodataItems(document).concat(collectJsonLDItems(document)); + }, + + /** + * Performs PageData collection from the given document. + * + * @param {Document} document + * The DOM document to collect from. + * @returns {PageData} + */ + collect(document) { + let pageData = { data: {} }; + + let items = this.collectItems(document); + + for (let item of items) { + switch (item.type) { + case "Product": + if (!(PageDataSchema.DATA_TYPE.PRODUCT in pageData.data)) { + collectProduct(document, pageData, item); + } + break; + case "Organization": + pageData.siteName = item.get("name"); + break; + } + } + + return pageData; + }, +}; |