summary | refs | log | tree | commit | diff | stats
path: root/browser/components/pagedata/SchemaOrgPageData.sys.mjs
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- browser/components/pagedata/SchemaOrgPageData.sys.mjs 441
1 files changed, 441 insertions, 0 deletions
diff --git a/browser/components/pagedata/SchemaOrgPageData.sys.mjs b/browser/components/pagedata/SchemaOrgPageData.sys.mjs
new file mode 100644
index 0000000000..449572c76f
--- /dev/null
+++ b/browser/components/pagedata/SchemaOrgPageData.sys.mjs
@@ -0,0 +1,441 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+import { PageDataSchema } from "resource:///modules/pagedata/PageDataSchema.sys.mjs";
+
+/**
+ * Represents an item from the schema.org specification.
+ *
+ * Every `Item` has a type and a set of properties. Each property has a string
+ * name and a list of values. It often isn't clear from the spec whether a
+ * property is expected to have a list of values or just one value so this
+ * data structure stores every property as a list and provides a simple method
+ * to get the first property value.
+ */
+class Item {
+  /** @type {string} The type of the item e.g. "Product" or "Person". */
+  type;
+
+  /** @type {Map<string, any[]>} Properties of the item. */
+  properties = new Map();
+
+  /**
+   * Constructs a new `Item` of the given type.
+   *
+   * @param {string} type
+   *   The type of the item.
+   */
+  constructor(type) {
+    this.type = type;
+  }
+
+  /**
+   * Tests whether a property has any values in this item.
+   *
+   * @param {string} prop
+   *   The name of the property.
+   * @returns {boolean}
+   *   True only if at least one value is stored for the property.
+   */
+  has(prop) {
+    // `properties` is public and may hold an empty list for a name, so the
+    // presence of the key alone does not mean there is a value to read.
+    return this.all(prop).length > 0;
+  }
+
+  /**
+   * Gets all of the values for a property. This may return an empty array if
+   * there are no values.
+   *
+   * @param {string} prop
+   *   The name of the property.
+   * @returns {any[]}
+   */
+  all(prop) {
+    return this.properties.get(prop) ?? [];
+  }
+
+  /**
+   * Gets the first value for a property.
+   *
+   * @param {string} prop
+   *   The name of the property.
+   * @returns {any}
+   *   The first value, or undefined if the property has no values.
+   */
+  get(prop) {
+    return this.properties.get(prop)?.[0];
+  }
+
+  /**
+   * Adds a value for a property. Existing values are kept; the new value is
+   * appended to the end of the property's list.
+   *
+   * @param {string} prop
+   *   The name of the property.
+   * @param {any} value
+   *   The value of the property.
+   */
+  set(prop, value) {
+    let props = this.properties.get(prop);
+    if (props === undefined) {
+      props = [];
+      this.properties.set(prop, props);
+    }
+
+    props.push(value);
+  }
+
+  /**
+   * Converts this item to JSON-LD.
+   *
+   * Properties holding exactly one value are emitted as that value; all other
+   * properties (including empty ones) are emitted as arrays. Nested `Item`
+   * values are converted recursively.
+   *
+   * @returns {object}
+   */
+  toJsonLD() {
+    /**
+     * Converts a value to its JSON-LD representation.
+     *
+     * @param {any} val
+     *   The value to convert.
+     * @returns {any}
+     */
+    function toLD(val) {
+      if (val instanceof Item) {
+        return val.toJsonLD();
+      }
+      return val;
+    }
+
+    let props = Array.from(this.properties, ([key, value]) => {
+      if (value.length === 1) {
+        return [key, toLD(value[0])];
+      }
+
+      return [key, value.map(toLD)];
+    });
+
+    return {
+      "@type": this.type,
+      ...Object.fromEntries(props),
+    };
+  }
+}
+
+/**
+ * Parses the value for a given microdata property.
+ * See https://html.spec.whatwg.org/multipage/microdata.html#values for the parsing spec
+ *
+ * @param {Element} propElement
+ *   The property element. Must not be an `itemscope` element.
+ * @returns {any}
+ *   The value of the property. URL-valued elements yield an absolute URL
+ *   string, or the empty string if the attribute is missing or unparsable.
+ * @throws {Error}
+ *   If the element has an `itemscope` attribute, since its value is an item
+ *   rather than a simple value.
+ */
+function parseMicrodataProp(propElement) {
+  if (propElement.hasAttribute("itemscope")) {
+    throw new Error(
+      "Cannot parse a simple property value from an itemscope element."
+    );
+  }
+
+  // Resolves the named attribute against the document URL. Returns "" when
+  // the attribute is absent or cannot be parsed as a URL.
+  const parseUrl = (urlElement, attr) => {
+    if (!urlElement.hasAttribute(attr)) {
+      return "";
+    }
+
+    try {
+      let url = new URL(
+        urlElement.getAttribute(attr),
+        urlElement.ownerDocument.documentURI
+      );
+      return url.toString();
+    } catch (e) {
+      return "";
+    }
+  };
+
+  switch (propElement.localName) {
+    case "meta":
+      return propElement.getAttribute("content") ?? "";
+    case "audio":
+    case "embed":
+    case "iframe":
+    case "source":
+    case "track":
+    case "video":
+      return parseUrl(propElement, "src");
+    case "img":
+      // Some pages may be using a lazy loading approach to images, putting a
+      // temporary image in "src" while the real image is in a differently
+      // named attribute. So far we found "content" and "data-src" are common
+      // names for that attribute.
+      return (
+        parseUrl(propElement, "content") ||
+        parseUrl(propElement, "data-src") ||
+        parseUrl(propElement, "src")
+      );
+    case "object":
+      return parseUrl(propElement, "data");
+    case "a":
+    case "area":
+    case "link":
+      return parseUrl(propElement, "href");
+    case "data":
+    case "meter":
+      // The spec defines the value as the empty string when the attribute is
+      // missing, matching the handling of "meta" above.
+      return propElement.getAttribute("value") ?? "";
+    case "time":
+      if (propElement.hasAttribute("datetime")) {
+        return propElement.getAttribute("datetime");
+      }
+      return propElement.textContent;
+    default:
+      // Not mentioned in the spec but sites seem to use it.
+      if (propElement.hasAttribute("content")) {
+        return propElement.getAttribute("content");
+      }
+      return propElement.textContent;
+  }
+}
+
+/**
+ * Collects product data from an item.
+ *
+ * Sets `pageData.image` and `pageData.description` when the item provides
+ * usable values, and always stores a product entry (name plus the first
+ * parsable offer price, if any) under `PageDataSchema.DATA_TYPE.PRODUCT`.
+ *
+ * @param {Document} document
+ *   The document the item comes from.
+ * @param {PageData} pageData
+ *   The pageData object to add to.
+ * @param {Item} item
+ *   The product item.
+ */
+function collectProduct(document, pageData, item) {
+  // `image` is either a URL or a nested item such as an ImageObject, whose
+  // location is carried by its `url` or `contentUrl` property.
+  let image = item.get("image");
+  if (image instanceof Item) {
+    image = image.get("url") ?? image.get("contentUrl");
+  }
+
+  // An empty string would resolve to the document's own URL, so skip it.
+  if (typeof image == "string" && image) {
+    try {
+      pageData.image = new URL(image, document.documentURI).toString();
+    } catch (e) {
+      // Not a resolvable URL. Leave the image unset rather than aborting the
+      // rest of the collection.
+    }
+  }
+
+  let description = item.get("description");
+  if (description !== undefined) {
+    pageData.description = description;
+  }
+
+  let product = {
+    name: item.get("name"),
+  };
+  pageData.data[PageDataSchema.DATA_TYPE.PRODUCT] = product;
+
+  for (let offer of item.all("offers")) {
+    if (!(offer instanceof Item) || offer.type != "Offer") {
+      continue;
+    }
+
+    let price = parseFloat(offer.get("price"));
+    if (!Number.isNaN(price)) {
+      product.price = {
+        value: price,
+        currency: offer.get("priceCurrency"),
+      };
+
+      // Only the first offer with a numeric price is used.
+      break;
+    }
+  }
+}
+
+/**
+ * Returns the root microdata items from the given document.
+ *
+ * Only items whose `itemtype` is in the schema.org vocabulary are collected.
+ * Nested items that are not schema.org items (no `itemtype`, or a different
+ * vocabulary) are skipped along with the properties they contain.
+ *
+ * @param {Document} document
+ *   The DOM document to collect from.
+ * @returns {Item[]}
+ *   The items that are not the value of a property of another item.
+ */
+function collectMicrodataItems(document) {
+  // First find all of the items in the document.
+  let itemElements = document.querySelectorAll(
+    "[itemscope][itemtype^='https://schema.org/'], [itemscope][itemtype^='http://schema.org/']"
+  );
+
+  /**
+   * Maps elements to the closest item. A `null` entry marks an element that
+   * is, or sits inside, a nested item we do not understand.
+   *
+   * @type {Map<Element, Item|null>}
+   */
+  let items = new Map();
+
+  /**
+   * Finds the item for an element. Throws if there is no item. Caches the
+   * result.
+   *
+   * @param {Element} element
+   *   The element to search from.
+   * @returns {Item|null}
+   *   The owning item, or null if the element belongs to a skipped item.
+   */
+  function itemFor(element) {
+    if (items.has(element)) {
+      return items.get(element);
+    }
+
+    if (!element.parentElement) {
+      throw new Error("Element has no parent item.");
+    }
+
+    let item = itemFor(element.parentElement);
+    items.set(element, item);
+    return item;
+  }
+
+  for (let element of itemElements) {
+    let itemType = element.getAttribute("itemtype");
+    // Strip off the base url
+    if (itemType.startsWith("https://")) {
+      itemType = itemType.substring("https://schema.org/".length);
+    } else {
+      itemType = itemType.substring("http://schema.org/".length);
+    }
+
+    items.set(element, new Item(itemType));
+  }
+
+  // The initial roots are just all the items.
+  let roots = new Set(items.values());
+
+  // Now find all item properties.
+  let itemProps = document.querySelectorAll(
+    "[itemscope][itemtype^='https://schema.org/'] [itemprop], [itemscope][itemtype^='http://schema.org/'] [itemprop]"
+  );
+
+  for (let element of itemProps) {
+    // The item is always defined above the current element.
+    let owner = itemFor(element.parentElement);
+    if (!owner) {
+      // This property belongs to a nested item that was skipped.
+      continue;
+    }
+
+    // The properties value is either a nested item or a simple value.
+    let propValue = items.get(element);
+    if (propValue === undefined) {
+      if (element.hasAttribute("itemscope")) {
+        // A nested item outside the schema.org vocabulary. It has no simple
+        // value (parseMicrodataProp would throw), so mark it as a boundary so
+        // that its own properties are not attributed to the outer item.
+        items.set(element, null);
+        continue;
+      }
+
+      propValue = parseMicrodataProp(element);
+    }
+
+    // itemprop holds a space-separated set of names; the value applies to
+    // each of them.
+    let names = (element.getAttribute("itemprop") ?? "")
+      .split(/\s+/)
+      .filter(Boolean);
+    for (let name of new Set(names)) {
+      owner.set(name, propValue);
+    }
+
+    if (propValue instanceof Item) {
+      // This item belongs to another item and so is not a root item.
+      roots.delete(propValue);
+    }
+  }
+
+  return [...roots];
+}
+
+/**
+ * Returns the root JSON-LD items from the given document.
+ *
+ * Each `application/ld+json` script is parsed independently. Scripts that do
+ * not parse, are not objects, do not declare a schema.org `@context`, or have
+ * no `@type` at the top level are ignored.
+ *
+ * @param {Document} document
+ *   The DOM document to collect from.
+ * @returns {Item[]}
+ */
+function collectJsonLDItems(document) {
+  /**
+   * The root items.
+   *
+   * @type {Item[]}
+   */
+  let items = [];
+
+  /**
+   * Converts a JSON-LD value into an Item if appropriate.
+   *
+   * @param {any} val
+   *   The value to convert.
+   * @returns {any}
+   *   An `Item` for objects carrying `@type`, otherwise the value unchanged.
+   */
+  function fromLD(val) {
+    // `typeof null` is "object" and `in` throws on null, so rule it out
+    // first. Otherwise a single null property would discard the whole script.
+    if (val !== null && typeof val == "object" && "@type" in val) {
+      let item = new Item(val["@type"]);
+
+      for (let [prop, value] of Object.entries(val)) {
+        // Ignore meta properties.
+        if (prop.startsWith("@")) {
+          continue;
+        }
+
+        if (!Array.isArray(value)) {
+          value = [value];
+        }
+
+        item.properties.set(prop, value.map(fromLD));
+      }
+
+      return item;
+    }
+
+    return val;
+  }
+
+  /**
+   * Tests whether a JSON-LD `@context` value names the schema.org vocabulary.
+   *
+   * @param {any} context
+   *   The `@context` value.
+   * @returns {boolean}
+   */
+  function isSchemaOrgContext(context) {
+    // A single-element array is treated the same as its only entry.
+    if (Array.isArray(context) && context.length == 1) {
+      context = context[0];
+    }
+
+    if (typeof context != "string") {
+      return false;
+    }
+
+    // Sites write the context both with and without a trailing slash.
+    if (context.endsWith("/")) {
+      context = context.slice(0, -1);
+    }
+
+    return context == "http://schema.org" || context == "https://schema.org";
+  }
+
+  let scripts = document.querySelectorAll("script[type='application/ld+json']");
+  for (let script of scripts) {
+    let content;
+    try {
+      content = JSON.parse(script.textContent);
+    } catch (e) {
+      // Unparsable content.
+      continue;
+    }
+
+    if (content === null || typeof content != "object") {
+      continue;
+    }
+
+    if (!("@context" in content)) {
+      continue;
+    }
+
+    if (!isSchemaOrgContext(content["@context"])) {
+      continue;
+    }
+
+    let item = fromLD(content);
+    if (item instanceof Item) {
+      items.push(item);
+    }
+  }
+
+  return items;
+}
+
+/**
+ * Entry point for gathering schema.org data from a page.
+ *
+ * HTML Microdata and JSON-LD are the supported formats; RDFa is not handled.
+ */
+export const SchemaOrgPageData = {
+  /**
+   * Gathers the top-level schema.org items found in the document. Microdata
+   * items come first, followed by JSON-LD items. Further items may be
+   * reachable through the properties of the returned ones.
+   *
+   * @param {Document} document
+   *   The DOM document to parse.
+   * @returns {Item[]}
+   */
+  collectItems(document) {
+    let microdataItems = collectMicrodataItems(document);
+    let jsonLDItems = collectJsonLDItems(document);
+    return [...microdataItems, ...jsonLDItems];
+  },
+
+  /**
+   * Builds a PageData object from the schema.org items in the document.
+   *
+   * @param {Document} document
+   *   The DOM document to collect from.
+   * @returns {PageData}
+   */
+  collect(document) {
+    let result = { data: {} };
+
+    for (let item of this.collectItems(document)) {
+      if (item.type === "Product") {
+        // Products are ignored once product data has been recorded.
+        let hasProduct = PageDataSchema.DATA_TYPE.PRODUCT in result.data;
+        if (!hasProduct) {
+          collectProduct(document, result, item);
+        }
+      } else if (item.type === "Organization") {
+        // Each organization overwrites the site name from any earlier one.
+        result.siteName = item.get("name");
+      }
+    }
+
+    return result;
+  },
+};