diff options
Diffstat (limited to 'browser/components/pagedata/PageDataSchema.sys.mjs')
-rw-r--r-- | browser/components/pagedata/PageDataSchema.sys.mjs | 254 |
1 files changed, 254 insertions, 0 deletions
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";

const lazy = {};

ChromeUtils.defineESModuleGetters(lazy, {
  OpenGraphPageData: "resource:///modules/pagedata/OpenGraphPageData.sys.mjs",
  SchemaOrgPageData: "resource:///modules/pagedata/SchemaOrgPageData.sys.mjs",
  TwitterPageData: "resource:///modules/pagedata/TwitterPageData.sys.mjs",
});

XPCOMUtils.defineLazyModuleGetters(lazy, {
  JsonSchemaValidator:
    "resource://gre/modules/components-utils/JsonSchemaValidator.jsm",
});

// Console instance prefixed with "PageData". Debug output is only enabled when
// the "browser.pagedata.log" preference is true; otherwise only warnings and
// above are shown.
XPCOMUtils.defineLazyGetter(lazy, "logConsole", function() {
  return console.createInstance({
    prefix: "PageData",
    maxLogLevel: Services.prefs.getBoolPref("browser.pagedata.log", false)
      ? "Debug"
      : "Warn",
  });
});

/**
 * The list of page data collectors. These should be sorted in order of
 * specificity, if the same piece of data is provided by two collectors then the
 * earlier wins.
 *
 * Collectors must provide a `collect` function which will be passed the
 * document object and should return the PageData structure. The function may be
 * asynchronous if needed.
 *
 * The data returned need not be valid, collectors should return whatever they
 * can and then we drop anything that is invalid once all data is joined.
 */
XPCOMUtils.defineLazyGetter(lazy, "DATA_COLLECTORS", function() {
  return [lazy.SchemaOrgPageData, lazy.OpenGraphPageData, lazy.TwitterPageData];
});

// Cache of parsed schemas, keyed by lowercased schema name.
let SCHEMAS = new Map();

/**
 * Loads the schema for the given name. The parsed schema is cached after the
 * first successful fetch so later calls do not hit the network layer again.
 *
 * @param {string} schemaName
 *   The name of the schema to load. The name is lowercased before use, so
 *   lookups are case-insensitive.
 *
 * @returns {Promise<object>} The parsed JSON schema.
 * @throws {Error} If the schema file could not be fetched.
 */
async function loadSchema(schemaName) {
  // The names used in this file ("general" and the DATA_TYPE keys) are ASCII,
  // so use the locale-independent conversion. toLocaleLowerCase() follows the
  // default locale and maps "I" to a dotless "ı" under Turkic locales, which
  // would yield a different file name (e.g. "artıcle").
  let key = schemaName.toLowerCase();

  if (SCHEMAS.has(key)) {
    return SCHEMAS.get(key);
  }

  let url = `chrome://browser/content/pagedata/schemas/${key}.schema.json`;
  let response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to load schema: ${response.statusText}`);
  }

  let schema = await response.json();
  SCHEMAS.set(key, schema);
  return schema;
}

/**
 * Validates the data using the schema with the given name. Resolves with no
 * value when the data is valid and throws the validator's error otherwise.
 *
 * @param {string} schemaName
 *   The name of the schema to validate against.
 * @param {object} data
 *   The data to validate.
 */
async function validateData(schemaName, data) {
  // loadSchema normalises the case of the name itself.
  let schema = await loadSchema(schemaName);

  let result = lazy.JsonSchemaValidator.validate(data, schema, {
    allowExplicitUndefinedProperties: true,
    // Allowed for future expansion of the schema.
    allowExtraProperties: true,
  });

  if (!result.valid) {
    throw result.error;
  }
}

/**
 * A shared API that can be used in parent or child processes
 */
export const PageDataSchema = {
  // Enumeration of data types. The keys must match the schema name.
  DATA_TYPE: Object.freeze({
    // Note that 1 and 2 were used as types in earlier versions and should not be used here.
    PRODUCT: 3,
    DOCUMENT: 4,
    ARTICLE: 5,
    AUDIO: 6,
    VIDEO: 7,
  }),

  /**
   * Gets the data type name.
   *
   * @param {DATA_TYPE} type
   *   The data type from the DATA_TYPE enumeration
   *
   * @returns {string | null} The name for the type or null if not found.
   */
  nameForType(type) {
    for (let [name, value] of Object.entries(this.DATA_TYPE)) {
      // Loose equality is deliberate: validatePageData passes the keys of the
      // data map, which are strings such as "3", and they must still match the
      // numeric enumeration values.
      // eslint-disable-next-line eqeqeq
      if (value == type) {
        return name;
      }
    }

    return null;
  },

  /**
   * Asynchronously validates some page data against the expected schema. Throws
   * an exception if validation fails.
   *
   * @param {DATA_TYPE} type
   *   The data type from the DATA_TYPE enumeration
   * @param {object} data
   *   The page data
   */
  async validateData(type, data) {
    let name = this.nameForType(type);

    if (!name) {
      throw new Error(`Unknown data type ${type}`);
    }

    return validateData(name, data);
  },

  /**
   * Asynchronously validates an entire PageData structure. Any invalid or
   * unknown data types are dropped. The general (non type-specific) part of
   * the structure must validate against the "general" schema, otherwise the
   * returned promise rejects.
   *
   * @param {PageData} pageData
   *   The page data
   *
   * @returns {PageData} The validated page data structure
   */
  async validatePageData(pageData) {
    // Split the type-specific data map from the general properties.
    let { data: dataMap = {}, ...general } = pageData;

    await validateData("general", general);

    let validData = {};

    for (let [type, data] of Object.entries(dataMap)) {
      let name = this.nameForType(type);
      // Ignore unknown types here.
      if (!name) {
        continue;
      }

      try {
        await validateData(name, data);

        validData[type] = data;
      } catch (e) {
        // Invalid data is dropped, but leave a trace for debugging.
        lazy.logConsole.debug(`Dropping invalid ${name} data`, e);
      }
    }

    return {
      ...general,
      data: validData,
    };
  },

  /**
   * Adds new page data into an existing data set. Any existing data is not
   * overwritten. Neither argument is modified.
   *
   * @param {PageData} existingPageData
   *   The existing page data
   * @param {PageData} newPageData
   *   The new page data
   *
   * @returns {PageData} The joined data.
   */
  coalescePageData(existingPageData, newPageData) {
    // Split out the general data from the map of specific data.
    let { data: existingMap = {}, ...existingGeneral } = existingPageData;
    let { data: newMap = {}, ...newGeneral } = newPageData;

    // newGeneral is a fresh object created by the rest pattern above, so it is
    // safe to write into. Existing values win over new ones.
    Object.assign(newGeneral, existingGeneral);

    let dataMap = {};
    for (let [type, data] of Object.entries(existingMap)) {
      if (type in newMap) {
        // Both sides have this type: merge property by property, with the
        // existing properties taking precedence.
        dataMap[type] = Object.assign({}, newMap[type], data);
      } else {
        dataMap[type] = data;
      }
    }

    // Add the types that only the new data provides.
    for (let [type, data] of Object.entries(newMap)) {
      if (!(type in dataMap)) {
        dataMap[type] = data;
      }
    }

    return {
      ...newGeneral,
      data: dataMap,
    };
  },

  /**
   * Collects page data from a DOM document. Every collector is run; a
   * collector that throws is logged and skipped so that the remaining
   * collectors can still contribute.
   *
   * @param {Document} document
   *   The DOM document to collect data from
   *
   * @returns {Promise<PageData | null>} The data collected or null in case of
   *   error.
   */
  async collectPageData(document) {
    lazy.logConsole.debug("Starting collection", document.documentURI);

    let pending = lazy.DATA_COLLECTORS.map(async collector => {
      try {
        return await collector.collect(document);
      } catch (e) {
        lazy.logConsole.error("Error collecting page data", e);
        return null;
      }
    });

    let pageDataList = await Promise.all(pending);

    try {
      let pageData = pageDataList
        // Failed collectors resolve to null above; coalescePageData cannot
        // destructure null, so skip them.
        .filter(collected => collected != null)
        .reduce(
          (joined, collected) =>
            PageDataSchema.coalescePageData(joined, collected),
          {
            date: Date.now(),
            url: document.documentURI,
          }
        );

      // The await is required: without it a rejected validation promise would
      // escape this try block instead of being turned into null.
      return await this.validatePageData(pageData);
    } catch (e) {
      lazy.logConsole.error("Failed to collect valid page data", e);
      return null;
    }
  },
};