diff options
Diffstat (limited to 'browser/components/pagedata')
31 files changed, 3667 insertions, 0 deletions
diff --git a/browser/components/pagedata/.eslintrc.js b/browser/components/pagedata/.eslintrc.js new file mode 100644 index 0000000000..8ead689bcc --- /dev/null +++ b/browser/components/pagedata/.eslintrc.js @@ -0,0 +1,14 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +"use strict"; + +module.exports = { + extends: ["plugin:mozilla/require-jsdoc"], + + rules: { + "mozilla/var-only-at-top-level": "error", + "no-unused-expressions": "error", + }, +}; diff --git a/browser/components/pagedata/OpenGraphPageData.sys.mjs b/browser/components/pagedata/OpenGraphPageData.sys.mjs new file mode 100644 index 0000000000..8f8b361799 --- /dev/null +++ b/browser/components/pagedata/OpenGraphPageData.sys.mjs @@ -0,0 +1,46 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/** + * Collects Open Graph (https://opengraphprotocol.org/) related data from a page. + */ +export const OpenGraphPageData = { + /** + * Collects the opengraph data from the page. + * + * @param {Document} document + * The document to collect from + * + * @returns {PageData} + */ + collect(document) { + let pageData = {}; + + // Sites can technically define an Open Graph prefix other than `og:`. + // However, `og:` is one of the default RDFa prefixes and it's likely + // uncommon that sites use a custom prefix. If we find that metadata is + // missing for common sites due to this issue, we could consider adding a + // basic RDFa parser. + let openGraphTags = document.querySelectorAll("meta[property^='og:']"); + + for (let tag of openGraphTags) { + // Strip "og:" from the property name.
+ let propertyName = tag.getAttribute("property").substring(3); + + switch (propertyName) { + case "description": + pageData.description = tag.getAttribute("content"); + break; + case "site_name": + pageData.siteName = tag.getAttribute("content"); + break; + case "image": + pageData.image = tag.getAttribute("content"); + break; + } + } + + return pageData; + }, +}; diff --git a/browser/components/pagedata/PageDataChild.sys.mjs b/browser/components/pagedata/PageDataChild.sys.mjs new file mode 100644 index 0000000000..51dc384526 --- /dev/null +++ b/browser/components/pagedata/PageDataChild.sys.mjs @@ -0,0 +1,121 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs"; + +const lazy = {}; + +ChromeUtils.defineESModuleGetters(lazy, { + PageDataSchema: "resource:///modules/pagedata/PageDataSchema.sys.mjs", + PrivateBrowsingUtils: "resource://gre/modules/PrivateBrowsingUtils.sys.mjs", +}); + +// We defer any attempt to check for page data for a short time after a page +// loads to allow JS to operate. +XPCOMUtils.defineLazyPreferenceGetter( + lazy, + "READY_DELAY", + "browser.pagedata.readyDelay", + 500 +); + +/** + * The actor responsible for monitoring a page for page data. + */ +export class PageDataChild extends JSWindowActorChild { + #isContentWindowPrivate = true; + /** + * Used to debounce notifications about a page being ready. + * + * @type {Timer | null} + */ + #deferTimer = null; + + /** + * Called when the actor is created for a new page. + */ + actorCreated() { + this.#isContentWindowPrivate = + lazy.PrivateBrowsingUtils.isContentWindowPrivate(this.contentWindow); + } + + /** + * Called when the page is destroyed. 
+ */ + didDestroy() { + if (this.#deferTimer) { + this.#deferTimer.cancel(); + } + } + + /** + * Called when the page has signalled it is done loading. This signal is + * debounced by READY_DELAY. + */ + #deferReady() { + if (!this.#deferTimer) { + this.#deferTimer = Cc["@mozilla.org/timer;1"].createInstance(Ci.nsITimer); + } + + // If the timer was already running this re-starts it. + this.#deferTimer.initWithCallback( + () => { + this.#deferTimer = null; + this.sendAsyncMessage("PageData:DocumentReady", { + url: this.document.documentURI, + }); + }, + lazy.READY_DELAY, + Ci.nsITimer.TYPE_ONE_SHOT_LOW_PRIORITY + ); + } + + /** + * Called when a message is received from the parent process. + * + * @param {ReceiveMessageArgument} msg + * The received message. + * + * @returns {Promise | undefined} + * A promise for the requested data or undefined if no data was requested. + */ + receiveMessage(msg) { + if (this.#isContentWindowPrivate) { + return undefined; + } + + switch (msg.name) { + case "PageData:CheckLoaded": + // The service just started in the parent. Check if this document is + // already loaded. + if (this.document.readyState == "complete") { + this.#deferReady(); + } + break; + case "PageData:Collect": + return lazy.PageDataSchema.collectPageData(this.document); + } + + return undefined; + } + + /** + * DOM event handler. + * + * @param {Event} event + * The DOM event. + */ + handleEvent(event) { + if (this.#isContentWindowPrivate) { + return; + } + + switch (event.type) { + case "DOMContentLoaded": + case "pageshow": + this.#deferReady(); + break; + } + } +} diff --git a/browser/components/pagedata/PageDataParent.sys.mjs b/browser/components/pagedata/PageDataParent.sys.mjs new file mode 100644 index 0000000000..25295adeca --- /dev/null +++ b/browser/components/pagedata/PageDataParent.sys.mjs @@ -0,0 +1,56 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0.
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +const lazy = {}; + +ChromeUtils.defineESModuleGetters(lazy, { + PageDataService: "resource:///modules/pagedata/PageDataService.sys.mjs", +}); + +/** + * Receives messages from PageDataChild and passes them to the PageData service. + */ +export class PageDataParent extends JSWindowActorParent { + #deferredCollection = null; + + /** + * Starts data collection in the child process. Returns a promise that + * resolves to the page data or null if the page is closed before data + * collection completes. + * + * @returns {Promise<PageData|null>} + */ + collectPageData() { + if (!this.#deferredCollection) { + this.#deferredCollection = Promise.withResolvers(); + this.sendQuery("PageData:Collect").then( + this.#deferredCollection.resolve, + this.#deferredCollection.reject + ); + } + + return this.#deferredCollection.promise; + } + + /** + * Called when the page is destroyed. + */ + didDestroy() { + this.#deferredCollection?.resolve(null); + } + + /** + * Called when a message is received from the content process. + * + * @param {ReceiveMessageArgument} msg + * The received message. + */ + receiveMessage(msg) { + switch (msg.name) { + case "PageData:DocumentReady": + lazy.PageDataService.pageLoaded(this, msg.data.url); + break; + } + } +} diff --git a/browser/components/pagedata/PageDataSchema.sys.mjs b/browser/components/pagedata/PageDataSchema.sys.mjs new file mode 100644 index 0000000000..ef3907325b --- /dev/null +++ b/browser/components/pagedata/PageDataSchema.sys.mjs @@ -0,0 +1,249 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +const lazy = {}; + +ChromeUtils.defineESModuleGetters(lazy, { + JsonSchemaValidator: + "resource://gre/modules/components-utils/JsonSchemaValidator.sys.mjs", + OpenGraphPageData: "resource:///modules/pagedata/OpenGraphPageData.sys.mjs", + SchemaOrgPageData: "resource:///modules/pagedata/SchemaOrgPageData.sys.mjs", + TwitterPageData: "resource:///modules/pagedata/TwitterPageData.sys.mjs", +}); + +ChromeUtils.defineLazyGetter(lazy, "logConsole", function () { + return console.createInstance({ + prefix: "PageData", + maxLogLevel: Services.prefs.getBoolPref("browser.pagedata.log", false) + ? "Debug" + : "Warn", + }); +}); + +/** + * The list of page data collectors. These should be sorted in order of + * specificity, if the same piece of data is provided by two collectors then the + * earlier wins. + * + * Collectors must provide a `collect` function which will be passed the + * document object and should return the PageData structure. The function may be + * asynchronous if needed. + * + * The data returned need not be valid, collectors should return whatever they + * can and then we drop anything that is invalid once all data is joined. + */ +ChromeUtils.defineLazyGetter(lazy, "DATA_COLLECTORS", function () { + return [lazy.SchemaOrgPageData, lazy.OpenGraphPageData, lazy.TwitterPageData]; +}); + +let SCHEMAS = new Map(); + +/** + * Loads the schema for the given name. + * + * @param {string} schemaName + * The name of the schema to load. + */ +async function loadSchema(schemaName) { + if (SCHEMAS.has(schemaName)) { + return SCHEMAS.get(schemaName); + } + + let url = `chrome://browser/content/pagedata/schemas/${schemaName.toLocaleLowerCase()}.schema.json`; + let response = await fetch(url); + if (!response.ok) { + throw new Error(`Failed to load schema: ${response.statusText}`); + } + + let schema = await response.json(); + SCHEMAS.set(schemaName, schema); + return schema; +} + +/** + * Validates the data using the schema with the given name. 
+ * + * @param {string} schemaName + * The name of the schema to validate against. + * @param {object} data + * The data to validate. + */ +async function validateData(schemaName, data) { + let schema = await loadSchema(schemaName.toLocaleLowerCase()); + + let result = lazy.JsonSchemaValidator.validate(data, schema, { + allowExplicitUndefinedProperties: true, + // Allowed for future expansion of the schema. + allowAdditionalProperties: true, + }); + + if (!result.valid) { + throw result.error; + } +} + +/** + * A shared API that can be used in parent or child processes + */ +export const PageDataSchema = { + // Enumeration of data types. The keys must match the schema name. + DATA_TYPE: Object.freeze({ + // Note that 1 and 2 were used as types in earlier versions and should not be used here. + PRODUCT: 3, + DOCUMENT: 4, + ARTICLE: 5, + AUDIO: 6, + VIDEO: 7, + }), + + /** + * Gets the data type name. + * + * @param {DATA_TYPE} type + * The data type from the DATA_TYPE enumeration + * + * @returns {string | null} The name for the type or null if not found. + */ + nameForType(type) { + for (let [name, value] of Object.entries(this.DATA_TYPE)) { + if (value == type) { + return name; + } + } + + return null; + }, + + /** + * Asynchronously validates some page data against the expected schema. Throws + * an exception if validation fails. + * + * @param {DATA_TYPE} type + * The data type from the DATA_TYPE enumeration + * @param {object} data + * The page data + */ + async validateData(type, data) { + let name = this.nameForType(type); + + if (!name) { + throw new Error(`Unknown data type ${type}`); + } + + return validateData(name, data); + }, + + /** + * Asynchronously validates an entire PageData structure. Any invalid or + * unknown data types are dropped. 
+ * + * @param {PageData} pageData + * The page data + * + * @returns {PageData} The validated page data structure + */ + async validatePageData(pageData) { + let { data: dataMap = {}, ...general } = pageData; + + await validateData("general", general); + + let validData = {}; + + for (let [type, data] of Object.entries(dataMap)) { + let name = this.nameForType(type); + // Ignore unknown types here. + if (!name) { + continue; + } + + try { + await validateData(name, data); + + validData[type] = data; + } catch (e) { + // Invalid data is dropped. + } + } + + return { + ...general, + data: validData, + }; + }, + + /** + * Adds new page data into an existing data set. Any existing data is not + * overwritten. + * + * @param {PageData} existingPageData + * The existing page data + * @param {PageData} newPageData + * The new page data + * + * @returns {PageData} The joined data. + */ + coalescePageData(existingPageData, newPageData) { + // Split out the general data from the map of specific data. + let { data: existingMap = {}, ...existingGeneral } = existingPageData; + let { data: newMap = {}, ...newGeneral } = newPageData; + + Object.assign(newGeneral, existingGeneral); + + let dataMap = {}; + for (let [type, data] of Object.entries(existingMap)) { + if (type in newMap) { + dataMap[type] = Object.assign({}, newMap[type], data); + } else { + dataMap[type] = data; + } + } + + for (let [type, data] of Object.entries(newMap)) { + if (!(type in dataMap)) { + dataMap[type] = data; + } + } + + return { + ...newGeneral, + data: dataMap, + }; + }, + + /** + * Collects page data from a DOM document. + * + * @param {Document} document + * The DOM document to collect data from + * + * @returns {Promise<PageData | null>} The data collected or null in case of + * error. 
+ */ + async collectPageData(document) { + lazy.logConsole.debug("Starting collection", document.documentURI); + + let pending = lazy.DATA_COLLECTORS.map(async collector => { + try { + return await collector.collect(document); + } catch (e) { + lazy.logConsole.error("Error collecting page data", e); + return null; + } + }); + + let pageDataList = (await Promise.all(pending)).filter(Boolean); + + let pageData = pageDataList.reduce(PageDataSchema.coalescePageData, { + date: Date.now(), + url: document.documentURI, + }); + + try { + return await this.validatePageData(pageData); + } catch (e) { + lazy.logConsole.error("Failed to collect valid page data", e); + return null; + } + }, +}; diff --git a/browser/components/pagedata/PageDataService.sys.mjs b/browser/components/pagedata/PageDataService.sys.mjs new file mode 100644 index 0000000000..7160705c27 --- /dev/null +++ b/browser/components/pagedata/PageDataService.sys.mjs @@ -0,0 +1,677 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs"; + +import { EventEmitter } from "resource://gre/modules/EventEmitter.sys.mjs"; + +const lazy = {}; + +ChromeUtils.defineESModuleGetters(lazy, { + BrowserWindowTracker: "resource:///modules/BrowserWindowTracker.sys.mjs", + E10SUtils: "resource://gre/modules/E10SUtils.sys.mjs", + HiddenFrame: "resource://gre/modules/HiddenFrame.sys.mjs", +}); + +ChromeUtils.defineLazyGetter(lazy, "logConsole", function () { + return console.createInstance({ + prefix: "PageData", + maxLogLevel: Services.prefs.getBoolPref("browser.pagedata.log", false) + ?
"Debug" + : "Warn", + }); +}); + +XPCOMUtils.defineLazyServiceGetters(lazy, { + idleService: ["@mozilla.org/widget/useridleservice;1", "nsIUserIdleService"], +}); + +XPCOMUtils.defineLazyPreferenceGetter( + lazy, + "fetchIdleTime", + "browser.pagedata.fetchIdleTime", + 300 +); + +const ALLOWED_SCHEMES = ["http", "https", "data", "blob"]; + +const BACKGROUND_WIDTH = 1024; +const BACKGROUND_HEIGHT = 768; + +/** + * Shifts the first element out of the set. + * + * @param {Set<T>} set + * The set containing elements. + * @returns {T | undefined} The first element in the set or undefined if + * there is nothing in the set. + */ +function shift(set) { + let iter = set.values(); + let { value, done } = iter.next(); + + if (done) { + return undefined; + } + + set.delete(value); + return value; +} + +/** + * A manager for hidden browsers. Responsible for creating and destroying a + * hidden frame to hold them. + */ +class HiddenBrowserManager { + /** + * The hidden frame if one has been created. + * + * @type {HiddenFrame | null} + */ + #frame = null; + /** + * The number of hidden browser elements currently in use. + * + * @type {number} + */ + #browsers = 0; + + /** + * Creates and returns a new hidden browser. + * + * @returns {Browser} + */ + async #acquireBrowser() { + this.#browsers++; + if (!this.#frame) { + this.#frame = new lazy.HiddenFrame(); + } + + let frame = await this.#frame.get(); + let doc = frame.document; + let browser = doc.createXULElement("browser"); + browser.setAttribute("remote", "true"); + browser.setAttribute("type", "content"); + browser.setAttribute( + "style", + ` + width: ${BACKGROUND_WIDTH}px; + min-width: ${BACKGROUND_WIDTH}px; + height: ${BACKGROUND_HEIGHT}px; + min-height: ${BACKGROUND_HEIGHT}px; + ` + ); + browser.setAttribute("maychangeremoteness", "true"); + doc.documentElement.appendChild(browser); + + return browser; + } + + /** + * Releases the given hidden browser. + * + * @param {Browser} browser + * The hidden browser element. 
+ */ + #releaseBrowser(browser) { + browser.remove(); + + this.#browsers--; + if (this.#browsers == 0) { + this.#frame.destroy(); + this.#frame = null; + } + } + + /** + * Calls a callback function with a new hidden browser. + * This function will return whatever the callback function returns. + * + * @param {Callback} callback + * The callback function will be called with the browser element and may + * be asynchronous. + * @returns {T} + */ + async withHiddenBrowser(callback) { + let browser = await this.#acquireBrowser(); + try { + return await callback(browser); + } finally { + this.#releaseBrowser(browser); + } + } +} + +/** + * @typedef {object} CacheEntry + * An entry in the page data cache. + * @property {PageData | null} pageData + * The data or null if there is no known data. + * @property {Set} actors + * The actors that maintain an interest in keeping the entry cached. + */ + +/** + * A cache of page data kept in memory. By default any discovered data from + * browsers is kept in memory until the browser element is destroyed but other + * actors may register an interest in keeping an entry alive beyond that. + */ +class PageDataCache { + /** + * The contents of the cache. Keyed on page url. + * + * @type {Map<string, CacheEntry>} + */ + #cache = new Map(); + + /** + * Creates or updates an entry in the cache. If no actor has registered any + * interest in keeping this page's data in memory then this will do nothing. + * + * @param {string} url + * The url of the page. + * @param {PageData|null} pageData + * The current page data for the page. + */ + set(url, pageData) { + let entry = this.#cache.get(url); + + if (entry) { + entry.pageData = pageData; + } + } + + /** + * Gets any cached data for the url. + * + * @param {string} url + * The url of the page. + * @returns {PageData | null} + * The page data if some is known. + */ + get(url) { + let entry = this.#cache.get(url); + return entry?.pageData ?? null; + } + + /** + * Adds a lock to an entry. 
This can be called before we have discovered the + * data for the url. + * + * @param {object} actor + * Ensures the entry stays in memory until unlocked by this actor. + * @param {string} url + * The url of the page. + */ + lockData(actor, url) { + let entry = this.#cache.get(url); + if (entry) { + entry.actors.add(actor); + } else { + this.#cache.set(url, { + pageData: undefined, + actors: new Set([actor]), + }); + } + } + + /** + * Removes a lock from an entry. + * + * @param {object} actor + * The lock to remove. + * @param {string | undefined} [url] + * The url of the page or undefined to unlock all urls locked by this actor. + */ + unlockData(actor, url) { + let entries = []; + if (url) { + let entry = this.#cache.get(url); + if (!entry) { + return; + } + + entries.push([url, entry]); + } else { + entries = [...this.#cache]; + } + + for (let [entryUrl, entry] of entries) { + if (entry.actors.delete(actor)) { + if (entry.actors.size == 0) { + this.#cache.delete(entryUrl); + } + } + } + } +} + +/** + * @typedef {object} PageData + * A set of discovered from a page. Other than the `data` property this is the + * schema at `browser/components/pagedata/schemas/general.schema.json`. + * @property {string} url + * The page's url. + * @property {number} date + * The epoch based timestamp for when the data was discovered. + * @property {string} siteName + * The page's friendly site name. + * @property {string} image + * The page's image. + * @property {object} data + * The map of data found which may be empty if no data was found. The key in + * map is from the `PageDataSchema.DATA_TYPE` enumeration. The values are in + * the format defined by the schemas at `browser/components/pagedata/schemas`. + */ + +export const PageDataService = new (class PageDataService extends EventEmitter { + /** + * Caches page data discovered from browsers. 
+ * + * @type {PageDataCache} + */ + #pageDataCache = new PageDataCache(); + + /** + * The number of currently running background fetches. + * + * @type {number} + */ + #backgroundFetches = 0; + + /** + * The list of urls waiting to be loaded in the background. + * + * @type {Set<string>} + */ + #backgroundQueue = new Set(); + + /** + * Tracks whether the user is currently idle. + * + * @type {boolean} + */ + #userIsIdle = false; + + /** + * A manager for hidden browsers. + * + * @type {HiddenBrowserManager} + */ + #browserManager = new HiddenBrowserManager(); + + /** + * A map of hidden browsers to a resolve function that should be passed the + * actor that was created for the browser. + * + * @type {WeakMap<Browser, function(PageDataParent): void>} + */ + #backgroundBrowsers = new WeakMap(); + + /** + * Tracks windows that have browsers with entries in the cache. + * + * @type {Map<Window, Set<Browser>>} + */ + #trackedWindows = new Map(); + + /** + * Constructs the service. + */ + constructor() { + super(); + + // Limits the number of background fetches that will run at once. Set to 0 to + // effectively allow an infinite number. + XPCOMUtils.defineLazyPreferenceGetter( + this, + "MAX_BACKGROUND_FETCHES", + "browser.pagedata.maxBackgroundFetches", + 5, + () => this.#startBackgroundWorkers() + ); + } + + /** + * Initializes a new instance of the service, not called externally. 
+ */ + init() { + if (!Services.prefs.getBoolPref("browser.pagedata.enabled", false)) { + return; + } + + ChromeUtils.registerWindowActor("PageData", { + parent: { + esModuleURI: "resource:///actors/PageDataParent.sys.mjs", + }, + child: { + esModuleURI: "resource:///actors/PageDataChild.sys.mjs", + events: { + DOMContentLoaded: {}, + pageshow: {}, + }, + }, + }); + + lazy.logConsole.debug("Service started"); + + for (let win of lazy.BrowserWindowTracker.orderedWindows) { + if (!win.closed) { + // Ask any existing tabs to report + for (let tab of win.gBrowser.tabs) { + let parent = + tab.linkedBrowser.browsingContext?.currentWindowGlobal?.getActor( + "PageData" + ); + + parent?.sendAsyncMessage("PageData:CheckLoaded"); + } + } + } + + lazy.idleService.addIdleObserver(this, lazy.fetchIdleTime); + } + + /** + * Called when the service is destroyed. This is generally on shutdown so we + * don't really need to do much cleanup. + */ + uninit() { + lazy.logConsole.debug("Service stopped"); + } + + /** + * Starts tracking for when a browser is destroyed. + * + * @param {Browser} browser + * The browser to track. + */ + #trackBrowser(browser) { + let window = browser.ownerGlobal; + + let browsers = this.#trackedWindows.get(window); + if (browsers) { + browsers.add(browser); + + // This window is already being tracked, no need to add listeners. + return; + } + + browsers = new Set([browser]); + this.#trackedWindows.set(window, browsers); + + window.addEventListener("unload", () => { + for (let closedBrowser of browsers) { + this.unlockEntry(closedBrowser); + } + + this.#trackedWindows.delete(window); + }); + + window.addEventListener("TabClose", ({ target: tab }) => { + // Unlock any entries locked by this browser. + let closedBrowser = tab.linkedBrowser; + this.unlockEntry(closedBrowser); + browsers.delete(closedBrowser); + }); + } + + /** + * Requests that any page data for this url is retained in memory until + * unlocked.
By calling this you are committing to later call `unlockEntry` + * with the same `actor` and `url` parameters. + * + * @param {object} actor + * The actor requesting the lock. + * @param {string} url + * The url of the page to lock. + */ + lockEntry(actor, url) { + this.#pageDataCache.lockData(actor, url); + } + + /** + * Notifies that an actor is no longer interested in a url. + * + * @param {object} actor + * The actor that requested the lock. + * @param {string | undefined} [url] + * The url of the page or undefined to unlock all urls locked by this actor. + */ + unlockEntry(actor, url) { + this.#pageDataCache.unlockData(actor, url); + } + + /** + * Called when the content process signals that a page is ready for data + * collection. + * + * @param {PageDataParent} actor + * The parent actor for the page. + * @param {string} url + * The url of the page. + */ + async pageLoaded(actor, url) { + let uri = Services.io.newURI(url); + if (!ALLOWED_SCHEMES.includes(uri.scheme)) { + return; + } + + let browser = actor.browsingContext?.embedderElement; + + // If we don't have a browser then it went away before we could record, + // so we don't know where the data came from. + if (!browser) { + return; + } + + // Is this a load in a background browser? + let backgroundResolve = this.#backgroundBrowsers.get(browser); + if (backgroundResolve) { + backgroundResolve(actor); + return; + } + + // Otherwise we only care about pages loaded in the tab browser. + if (!this.#isATabBrowser(browser)) { + return; + } + + try { + let data = await actor.collectPageData(); + if (data) { + // Keep this data alive until the browser is destroyed. + this.#trackBrowser(browser); + this.lockEntry(browser, data.url); + + this.pageDataDiscovered(data); + } + } catch (e) { + lazy.logConsole.error(e); + } + } + + /** + * Adds data for a url. This should generally only be called by other components of the + * page data service or tests for simulating page data collection. 
+ * + * @param {PageData} pageData + * The set of data discovered. + */ + pageDataDiscovered(pageData) { + lazy.logConsole.debug("Discovered page data", pageData); + + this.#pageDataCache.set(pageData.url, { + ...pageData, + data: pageData.data ?? {}, + }); + + // Send out a notification. + this.emit("page-data", pageData); + } + + /** + * Retrieves any cached page data. Returns null if there is no information in the cache, this will + * happen either if the page has not been browsed recently or if data collection failed for some + * reason. + * + * @param {string} url + * The url to retrieve data for. + * @returns {PageData|null} + * A `PageData` if one is cached (it may not actually contain any items of data) or null if this + * page has not been successfully checked for data recently. + */ + getCached(url) { + return this.#pageDataCache.get(url); + } + + /** + * Fetches page data from the given URL using a hidden window. Note that this does not populate + * the page data cache or emit the `page-data` event. + * + * @param {string} url + * The url to retrieve data for. + * @returns {Promise<PageData|null>} + * Resolves to the found pagedata or null in case of error. + */ + async fetchPageData(url) { + return this.#browserManager.withHiddenBrowser(async browser => { + try { + let { promise, resolve } = Promise.withResolvers(); + this.#backgroundBrowsers.set(browser, resolve); + + let principal = Services.scriptSecurityManager.getSystemPrincipal(); + let oa = lazy.E10SUtils.predictOriginAttributes({ + browser, + }); + let loadURIOptions = { + triggeringPrincipal: principal, + remoteType: lazy.E10SUtils.getRemoteTypeForURI( + url, + true, + false, + lazy.E10SUtils.DEFAULT_REMOTE_TYPE, + null, + oa + ), + }; + browser.fixupAndLoadURIString(url, loadURIOptions); + + let actor = await promise; + return await actor.collectPageData(); + } finally { + this.#backgroundBrowsers.delete(browser); + } + }); + } + + /** + * Handles notifications from the idle service. 
+ * + * @param {nsISupports} subject + * The notification's subject. + * @param {string} topic + * The notification topic. + * @param {string} data + * The data associated with the notification. + */ + observe(subject, topic, data) { + switch (topic) { + case "idle": + lazy.logConsole.debug("User went idle"); + this.#userIsIdle = true; + this.#startBackgroundWorkers(); + break; + case "active": + lazy.logConsole.debug("User became active"); + this.#userIsIdle = false; + break; + } + } + + /** + * Starts as many background workers as are allowed to process the background + * queue. + */ + #startBackgroundWorkers() { + if (!this.#userIsIdle) { + return; + } + + let toStart; + + if (this.MAX_BACKGROUND_FETCHES) { + toStart = this.MAX_BACKGROUND_FETCHES - this.#backgroundFetches; + } else { + toStart = this.#backgroundQueue.size; + } + + for (let i = 0; i < toStart; i++) { + this.#backgroundFetch(); + } + } + + /** + * Starts a background fetch worker which will pull urls from the queue and + * load them until the queue is empty. + */ + async #backgroundFetch() { + this.#backgroundFetches++; + + let url = shift(this.#backgroundQueue); + while (url) { + try { + let pageData = await this.fetchPageData(url); + + if (pageData) { + this.#pageDataCache.set(url, pageData); + this.emit("page-data", pageData); + } + } catch (e) { + lazy.logConsole.error(e); + } + + // Check whether the user became active or the worker limit changed + // dynamically. + if ( + !this.#userIsIdle || + (this.MAX_BACKGROUND_FETCHES > 0 && + this.#backgroundFetches > this.MAX_BACKGROUND_FETCHES) + ) { + break; + } + + url = shift(this.#backgroundQueue); + } + + this.#backgroundFetches--; + } + + /** + * Queues page data retrieval for a url. The page-data notification will be + * generated if data becomes available. + * + * Check `getCached` first to ensure that data is not already in the cache. + * + * @param {string} url + * The url to retrieve data for. 
+ */ + queueFetch(url) { + this.#backgroundQueue.add(url); + + this.#startBackgroundWorkers(); + } + + /** + * Determines if the given browser is contained within a tab. + * + * @param {DOMElement} browser + * The browser element to check. + * @returns {boolean} + * True if the browser element is contained within a tab. + */ + #isATabBrowser(browser) { + return browser.ownerGlobal.gBrowser?.getTabForBrowser(browser); + } +})(); diff --git a/browser/components/pagedata/SchemaOrgPageData.sys.mjs b/browser/components/pagedata/SchemaOrgPageData.sys.mjs new file mode 100644 index 0000000000..449572c76f --- /dev/null +++ b/browser/components/pagedata/SchemaOrgPageData.sys.mjs @@ -0,0 +1,441 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +import { PageDataSchema } from "resource:///modules/pagedata/PageDataSchema.sys.mjs"; + +/** + * Represents an item from the schema.org specification. + * + * Every `Item` has a type and a set of properties. Each property has a string + * name and a list of values. It often isn't clear from the spec whether a + * property is expected to have a list of values or just one value so this + * data structure stores every property as a list and provides a simple method + * to get the first property value. + */ +class Item { + /** @type {string} The type of the item e.g. "Product" or "Person". */ + type; + + /** @type {Map<string, any[]>} Properties of the item. */ + properties = new Map(); + + /** + * Constructors a new `Item` of the given type. + * + * @param {string} type + * The type of the item. + */ + constructor(type) { + this.type = type; + } + + /** + * Tests whether a property has any values in this item. + * + * @param {string} prop + * The name of the property. 
+ * @returns {boolean} + */ + has(prop) { + return this.properties.has(prop); + } + + /** + * Gets all of the values for a property. This may return an empty array if + * there are no values. + * + * @param {string} prop + * The name of the property. + * @returns {any[]} + */ + all(prop) { + return this.properties.get(prop) ?? []; + } + + /** + * Gets the first value for a property. + * + * @param {string} prop + * The name of the property. + * @returns {any} + */ + get(prop) { + return this.properties.get(prop)?.[0]; + } + + /** + * Sets a value for a property. + * + * @param {string} prop + * The name of the property. + * @param {any} value + * The value of the property. + */ + set(prop, value) { + let props = this.properties.get(prop); + if (props === undefined) { + props = []; + this.properties.set(prop, props); + } + + props.push(value); + } + + /** + * Converts this item to JSON-LD. + * + * Single array properties are converted into simple properties. + * + * @returns {object} + */ + toJsonLD() { + /** + * Converts a value to its JSON-LD representation. + * + * @param {any} val + * The value to convert. + * @returns {any} + */ + function toLD(val) { + if (val instanceof Item) { + return val.toJsonLD(); + } + return val; + } + + let props = Array.from(this.properties, ([key, value]) => { + if (value.length == 1) { + return [key, toLD(value[0])]; + } + + return [key, value.map(toLD)]; + }); + + return { + "@type": this.type, + ...Object.fromEntries(props), + }; + } +} + +/** + * Parses the value for a given microdata property. + * See https://html.spec.whatwg.org/multipage/microdata.html#values for the parsing spec + * + * @param {Element} propElement + * The property element. + * @returns {any} + * The value of the property. + */ +function parseMicrodataProp(propElement) { + if (propElement.hasAttribute("itemscope")) { + throw new Error( + "Cannot parse a simple property value from an itemscope element." 
+ ); + } + + const parseUrl = (urlElement, attr) => { + if (!urlElement.hasAttribute(attr)) { + return ""; + } + + try { + let url = new URL( + urlElement.getAttribute(attr), + urlElement.ownerDocument.documentURI + ); + return url.toString(); + } catch (e) { + return ""; + } + }; + + switch (propElement.localName) { + case "meta": + return propElement.getAttribute("content") ?? ""; + case "audio": + case "embed": + case "iframe": + case "source": + case "track": + case "video": + return parseUrl(propElement, "src"); + case "img": + // Some pages may be using a lazy loading approach to images, putting a + // temporary image in "src" while the real image is in a differently + // named attribute. So far we found "content" and "data-src" are common + // names for that attribute. + return ( + parseUrl(propElement, "content") || + parseUrl(propElement, "data-src") || + parseUrl(propElement, "src") + ); + case "object": + return parseUrl(propElement, "data"); + case "a": + case "area": + case "link": + return parseUrl(propElement, "href"); + case "data": + case "meter": + return propElement.getAttribute("value"); + case "time": + if (propElement.hasAtribute("datetime")) { + return propElement.getAttribute("datetime"); + } + return propElement.textContent; + default: + // Not mentioned in the spec but sites seem to use it. + if (propElement.hasAttribute("content")) { + return propElement.getAttribute("content"); + } + return propElement.textContent; + } +} + +/** + * Collects product data from an item. + * + * @param {Document} document + * The document the item comes from. + * @param {PageData} pageData + * The pageData object to add to. + * @param {Item} item + * The product item. 
+ */ +function collectProduct(document, pageData, item) { + if (item.has("image")) { + let url = new URL(item.get("image"), document.documentURI); + pageData.image = url.toString(); + } + + if (item.has("description")) { + pageData.description = item.get("description"); + } + + pageData.data[PageDataSchema.DATA_TYPE.PRODUCT] = { + name: item.get("name"), + }; + + for (let offer of item.all("offers")) { + if (!(offer instanceof Item) || offer.type != "Offer") { + continue; + } + + let price = parseFloat(offer.get("price")); + if (!isNaN(price)) { + pageData.data[PageDataSchema.DATA_TYPE.PRODUCT].price = { + value: price, + currency: offer.get("priceCurrency"), + }; + + break; + } + } +} + +/** + * Returns the root microdata items from the given document. + * + * @param {Document} document + * The DOM document to collect from. + * @returns {Item[]} + */ +function collectMicrodataItems(document) { + // First find all of the items in the document. + let itemElements = document.querySelectorAll( + "[itemscope][itemtype^='https://schema.org/'], [itemscope][itemtype^='http://schema.org/']" + ); + + /** + * Maps elements to the closest item. + * + * @type {Map<Element, Item>} + */ + let items = new Map(); + + /** + * Finds the item for an element. Throws if there is no item. Caches the + * result. + * + * @param {Element} element + * The element to search from. 
+ * @returns {Item} + */ + function itemFor(element) { + let item = items.get(element); + if (item) { + return item; + } + + if (!element.parentElement) { + throw new Error("Element has no parent item."); + } + + item = itemFor(element.parentElement); + items.set(element, item); + return item; + } + + for (let element of itemElements) { + let itemType = element.getAttribute("itemtype"); + // Strip off the base url + if (itemType.startsWith("https://")) { + itemType = itemType.substring(19); + } else { + itemType = itemType.substring(18); + } + + items.set(element, new Item(itemType)); + } + + // The initial roots are just all the items. + let roots = new Set(items.values()); + + // Now find all item properties. + let itemProps = document.querySelectorAll( + "[itemscope][itemtype^='https://schema.org/'] [itemprop], [itemscope][itemtype^='http://schema.org/'] [itemprop]" + ); + + for (let element of itemProps) { + // The item is always defined above the current element. + let item = itemFor(element.parentElement); + + // The properties value is either a nested item or a simple value. + let propValue = items.get(element) ?? parseMicrodataProp(element); + item.set(element.getAttribute("itemprop"), propValue); + + if (propValue instanceof Item) { + // This item belongs to another item and so is not a root item. + roots.delete(propValue); + } + } + + return [...roots]; +} + +/** + * Returns the root JSON-LD items from the given document. + * + * @param {Document} document + * The DOM document to collect from. + * @returns {Item[]} + */ +function collectJsonLDItems(document) { + /** + * The root items. + * + * @type {Item[]} + */ + let items = []; + + /** + * Converts a JSON-LD value into an Item if appropriate. + * + * @param {any} val + * The value to convert. 
+ * @returns {any} + */ + function fromLD(val) { + if (typeof val == "object" && "@type" in val) { + let item = new Item(val["@type"]); + + for (let [prop, value] of Object.entries(val)) { + // Ignore meta properties. + if (prop.startsWith("@")) { + continue; + } + + if (!Array.isArray(value)) { + value = [value]; + } + + item.properties.set(prop, value.map(fromLD)); + } + + return item; + } + + return val; + } + + let scripts = document.querySelectorAll("script[type='application/ld+json'"); + for (let script of scripts) { + try { + let content = JSON.parse(script.textContent); + + if (typeof content != "object") { + continue; + } + + if (!("@context" in content)) { + continue; + } + + if ( + content["@context"] != "http://schema.org" && + content["@context"] != "https://schema.org" + ) { + continue; + } + + let item = fromLD(content); + if (item instanceof Item) { + items.push(item); + } + } catch (e) { + // Unparsable content. + } + } + + return items; +} + +/** + * Collects schema.org related data from a page. + * + * Currently only supports HTML Microdata and JSON-LD formats, not RDFa. + */ +export const SchemaOrgPageData = { + /** + * Parses and collects the schema.org items from the given document. + * The returned items are the roots, i.e. the top-level items, there may be + * other items as nested properties. + * + * @param {Document} document + * The DOM document to parse. + * @returns {Item[]} + */ + collectItems(document) { + return collectMicrodataItems(document).concat(collectJsonLDItems(document)); + }, + + /** + * Performs PageData collection from the given document. + * + * @param {Document} document + * The DOM document to collect from. 
+ * @returns {PageData} + */ + collect(document) { + let pageData = { data: {} }; + + let items = this.collectItems(document); + + for (let item of items) { + switch (item.type) { + case "Product": + if (!(PageDataSchema.DATA_TYPE.PRODUCT in pageData.data)) { + collectProduct(document, pageData, item); + } + break; + case "Organization": + pageData.siteName = item.get("name"); + break; + } + } + + return pageData; + }, +}; diff --git a/browser/components/pagedata/TwitterPageData.sys.mjs b/browser/components/pagedata/TwitterPageData.sys.mjs new file mode 100644 index 0000000000..88b06098cb --- /dev/null +++ b/browser/components/pagedata/TwitterPageData.sys.mjs @@ -0,0 +1,42 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/** + * Collects Twitter card (https://developer.twitter.com/en/docs/twitter-for-websites/) + * related data from a page. + */ +export const TwitterPageData = { + /** + * Collects the twitter data from the page. + * + * @param {Document} document + * The document to collect from + * + * @returns {PageData} + */ + collect(document) { + let pageData = {}; + + let twitterTags = document.querySelectorAll("meta[name^='twitter:'"); + + for (let tag of twitterTags) { + // Strip "twitter:" from the property name. 
+ let propertyName = tag.getAttribute("name").substring(8); + + switch (propertyName) { + case "site": + pageData.siteName = tag.getAttribute("content"); + break; + case "description": + pageData.description = tag.getAttribute("content"); + break; + case "image": + pageData.image = tag.getAttribute("content"); + break; + } + } + + return pageData; + }, +}; diff --git a/browser/components/pagedata/docs/index.md b/browser/components/pagedata/docs/index.md new file mode 100644 index 0000000000..47b507d13a --- /dev/null +++ b/browser/components/pagedata/docs/index.md @@ -0,0 +1,50 @@ +# PageDataService + +The page data service is responsible for collecting additional data about a page. This could include +information about the media on a page, product information, etc. When enabled it will automatically +try to find page data for pages that the user browses or it can be directed to asynchronously look +up the page data for a url. + +The `PageDataService` is an EventEmitter and listeners can subscribe to its notifications via the +`on` and `once` methods. + +The service can be enabled by setting `browser.pagedata.enabled` to true. Additional logging can be +enabled by setting `browser.pagedata.log` to true. + +## PageData Data Structure + +At a high level the page data service can collect many different kinds of data. When queried the +service will respond with a `PageData` structure which holds some general information about the +page, the time when the data was discovered and a map of the different types of data found. This map +will be empty if no specific data was found. The key of the map is from the +`PageDataSchema.DATA_TYPE` enumeration. The value is the JSON data which differs in structure +depending on the data type. 
+ +``` +{ + "url": <url of the page as a string>, + "date": <epoch based timestamp>, + "siteName": <a friendly name for the website>, + "image": <url for an image for the page as a string>, + "data": <map of data types>, +} +``` + +## PageData Collection + +Page data is gathered in one of two ways. + +Page data is automatically gathered for webpages the user visits. This collection is triggered after +a short delay and then updated when necessary. Any data is cached in memory for a period of time. +When page data has been found a `page-data` event is emitted. The event's argument holds the +`PageData` structure. The `getCached` function can be used to access any cached data for a url. + +## Supported Types of page data + +The following types of page data (`PageDataSchema.DATA_TYPE`) are currently supported: + +- `PRODUCT` +- `DOCUMENT` +- `ARTICLE` +- `AUDIO` +- `VIDEO` diff --git a/browser/components/pagedata/jar.mn b/browser/components/pagedata/jar.mn new file mode 100644 index 0000000000..19860a30ee --- /dev/null +++ b/browser/components/pagedata/jar.mn @@ -0,0 +1,6 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +browser.jar: + content/browser/pagedata/schemas/ (schemas/*.json) diff --git a/browser/components/pagedata/moz.build b/browser/components/pagedata/moz.build new file mode 100644 index 0000000000..f1e49c4e4b --- /dev/null +++ b/browser/components/pagedata/moz.build @@ -0,0 +1,29 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +XPCSHELL_TESTS_MANIFESTS += [ + "tests/unit/xpcshell.toml", +] +BROWSER_CHROME_MANIFESTS += [ + "tests/browser/browser.toml", +] + +JAR_MANIFESTS += ["jar.mn"] + +EXTRA_JS_MODULES.pagedata += [ + "OpenGraphPageData.sys.mjs", + "PageDataSchema.sys.mjs", + "PageDataService.sys.mjs", + "SchemaOrgPageData.sys.mjs", + "TwitterPageData.sys.mjs", +] + +FINAL_TARGET_FILES.actors += [ + "PageDataChild.sys.mjs", + "PageDataParent.sys.mjs", +] + +SPHINX_TREES["docs"] = "docs" diff --git a/browser/components/pagedata/schemas/article.schema.json b/browser/components/pagedata/schemas/article.schema.json new file mode 100644 index 0000000000..e02bb11655 --- /dev/null +++ b/browser/components/pagedata/schemas/article.schema.json @@ -0,0 +1,26 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "article.schema.json", + "title": "Article", + "description": "An article for reading", + "type": "object", + "properties": { + "name": { + "description": "The article's name", + "type": "string" + }, + "author": { + "description": "The author(s) of the article", + "type": "string" + }, + "date": { + "description": "The date the article was published in ISO-8601 date or date/time format", + "type": "string" + }, + "readingTime": { + "description": "The expected time to read the article in seconds", + "type": "number" + } + }, + "required": ["name"] +} diff --git a/browser/components/pagedata/schemas/audio.schema.json b/browser/components/pagedata/schemas/audio.schema.json new file mode 100644 index 0000000000..db1b79b55c --- /dev/null +++ b/browser/components/pagedata/schemas/audio.schema.json @@ -0,0 +1,34 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "audio.schema.json", + "title": "Audio", + "description": "An audio file", + "type": "object", + "properties": { + "name": { + "description": "The audio's name", + "type": "string" + }, + "duration": { + "description": "The audio's duration in seconds", + "type": "number" + }, + 
"artist": { + "description": "The artist who created the audio", + "type": "string" + }, + "album": { + "description": "For music on an album the name of the album", + "type": "string" + }, + "track": { + "description": "For music on an album the number of the track on the album", + "type": "number" + }, + "genre": { + "description": "The genre of the audio", + "type": "string" + } + }, + "required": ["name"] +} diff --git a/browser/components/pagedata/schemas/document.schema.json b/browser/components/pagedata/schemas/document.schema.json new file mode 100644 index 0000000000..849010773b --- /dev/null +++ b/browser/components/pagedata/schemas/document.schema.json @@ -0,0 +1,18 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "document.schema.json", + "title": "Document", + "description": "A document of some kind, either viewable or editable", + "type": "object", + "properties": { + "name": { + "description": "The document's name", + "type": "string" + }, + "mimeType": { + "description": "The document's mimetype", + "type": "string" + } + }, + "required": ["name"] +} diff --git a/browser/components/pagedata/schemas/general.schema.json b/browser/components/pagedata/schemas/general.schema.json new file mode 100644 index 0000000000..a400fd889b --- /dev/null +++ b/browser/components/pagedata/schemas/general.schema.json @@ -0,0 +1,30 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "general.schema.json", + "title": "General", + "description": "General data about a page", + "type": "object", + "properties": { + "url": { + "description": "The page's url", + "type": "string" + }, + "date": { + "description": "The date the data was collected as a timestamp", + "type": "number" + }, + "description": { + "description": "A description of the page", + "type": "string" + }, + "siteName": { + "description": "A friendly name for the site", + "type": "string" + }, + "image": { + "description": "The url for an image representative 
of the page", + "type": "string" + } + }, + "required": ["url", "date"] +} diff --git a/browser/components/pagedata/schemas/product.schema.json b/browser/components/pagedata/schemas/product.schema.json new file mode 100644 index 0000000000..77bec76ff2 --- /dev/null +++ b/browser/components/pagedata/schemas/product.schema.json @@ -0,0 +1,46 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "product.schema.json", + "title": "Product", + "description": "A product that can be purchased", + "type": "object", + "properties": { + "name": { + "description": "The product's name", + "type": "string" + }, + "brand": { + "description": "The product's brand", + "type": "string" + }, + "price": { + "description": "The cost of a single unit", + "type": "object", + "properties": { + "value": { + "type": "number" + }, + "currency": { + "description": "The currency for the value", + "type": "string" + } + }, + "required": ["value"] + }, + "shippingCost": { + "description": "The cost of shipping", + "type": "object", + "properties": { + "value": { + "type": "number" + }, + "currency": { + "description": "The currency for the value", + "type": "string" + } + }, + "required": ["value"] + } + }, + "required": ["name"] +} diff --git a/browser/components/pagedata/schemas/video.schema.json b/browser/components/pagedata/schemas/video.schema.json new file mode 100644 index 0000000000..1091ebfe89 --- /dev/null +++ b/browser/components/pagedata/schemas/video.schema.json @@ -0,0 +1,38 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "video.schema.json", + "title": "Video", + "description": "A video", + "type": "object", + "properties": { + "name": { + "description": "The video's name", + "type": "string" + }, + "duration": { + "description": "The video's duration in seconds", + "type": "number" + }, + "quality": { + "description": "A short description of the video's quality (e.g. 
'HD', '720p')", + "type": "string" + }, + "show": { + "description": "For an episode of a TV show the name of the TV show", + "type": "string" + }, + "season": { + "description": "For an episode of a TV show the season number it appears in", + "type": "number" + }, + "episode": { + "description": "For an episode of a TV show the number of the episode in the season", + "type": "number" + }, + "genre": { + "description": "The genre of the video", + "type": "string" + } + }, + "required": ["name"] +} diff --git a/browser/components/pagedata/tests/browser/browser.toml b/browser/components/pagedata/tests/browser/browser.toml new file mode 100644 index 0000000000..8bcd7a539b --- /dev/null +++ b/browser/components/pagedata/tests/browser/browser.toml @@ -0,0 +1,16 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +[DEFAULT] +prefs = [ + "browser.pagedata.log=true", + "browser.pagedata.enabled=true", +] +support-files = ["head.js"] + +["browser_pagedata_background.js"] + +["browser_pagedata_basic.js"] + +["browser_pagedata_cache.js"] diff --git a/browser/components/pagedata/tests/browser/browser_pagedata_background.js b/browser/components/pagedata/tests/browser/browser_pagedata_background.js new file mode 100644 index 0000000000..bba2ae2e47 --- /dev/null +++ b/browser/components/pagedata/tests/browser/browser_pagedata_background.js @@ -0,0 +1,48 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/** + * Background load tests for the page data service. 
+ */ + +const TEST_URL = + "data:text/html," + + encodeURIComponent(` + <html> + <head> + <meta name="twitter:card" content="summary_large_image"> + <meta name="twitter:site" content="@nytimes"> + <meta name="twitter:creator" content="@SarahMaslinNir"> + <meta name="twitter:title" content="Parade of Fans for Houston’s Funeral"> + <meta name="twitter:description" content="NEWARK - The guest list and parade of limousines"> + <meta name="twitter:image" content="http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg"> + </head> + <body> + </body> + </html> +`); + +add_task(async function test_pagedata_no_data() { + let pageData = await PageDataService.fetchPageData(TEST_URL); + + delete pageData.date; + Assert.deepEqual( + pageData, + { + url: TEST_URL, + siteName: "@nytimes", + description: "NEWARK - The guest list and parade of limousines", + image: + "http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg", + data: {}, + }, + "Should have returned the right data" + ); + + Assert.equal( + PageDataService.getCached(TEST_URL), + null, + "Should not have cached this data" + ); +}); diff --git a/browser/components/pagedata/tests/browser/browser_pagedata_basic.js b/browser/components/pagedata/tests/browser/browser_pagedata_basic.js new file mode 100644 index 0000000000..4984645274 --- /dev/null +++ b/browser/components/pagedata/tests/browser/browser_pagedata_basic.js @@ -0,0 +1,64 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/** + * Basic tests for the page data service. 
+ */ + +const TEST_URL = "https://example.com/"; +const TEST_URL2 = "https://example.com/browser"; + +add_task(async function test_pagedata_no_data() { + let promise = PageDataService.once("page-data"); + + await BrowserTestUtils.withNewTab(TEST_URL, async browser => { + let pageData = await promise; + Assert.equal(pageData.url, TEST_URL, "Should have returned the loaded URL"); + Assert.deepEqual(pageData.data, {}, "Should have returned no data"); + Assert.deepEqual( + PageDataService.getCached(TEST_URL), + pageData, + "Should return the same data from the cache" + ); + + promise = PageDataService.once("page-data"); + BrowserTestUtils.startLoadingURIString(browser, TEST_URL2); + await BrowserTestUtils.browserLoaded(browser, false, TEST_URL2); + pageData = await promise; + Assert.equal( + pageData.url, + TEST_URL2, + "Should have returned the loaded URL" + ); + Assert.deepEqual(pageData.data, {}, "Should have returned no data"); + Assert.deepEqual( + PageDataService.getCached(TEST_URL2), + pageData, + "Should return the same data from the cache" + ); + + info("Test going back still triggers collection"); + + promise = PageDataService.once("page-data"); + let locationChangePromise = BrowserTestUtils.waitForLocationChange( + gBrowser, + TEST_URL + ); + browser.goBack(); + await locationChangePromise; + pageData = await promise; + + Assert.equal( + pageData.url, + TEST_URL, + "Should have returned the URL of the previous page" + ); + Assert.deepEqual(pageData.data, {}, "Should have returned no data"); + Assert.deepEqual( + PageDataService.getCached(TEST_URL), + pageData, + "Should return the same data from the cache" + ); + }); +}); diff --git a/browser/components/pagedata/tests/browser/browser_pagedata_cache.js b/browser/components/pagedata/tests/browser/browser_pagedata_cache.js new file mode 100644 index 0000000000..e41b4ea2f8 --- /dev/null +++ b/browser/components/pagedata/tests/browser/browser_pagedata_cache.js @@ -0,0 +1,155 @@ +/* This Source Code Form is 
subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/** + * Tests for the page data cache. + */ + +const TEST_URL = + "data:text/html," + + encodeURIComponent(` + <!DOCTYPE html> + <html> + <head> + <meta charset="utf-8"> + <meta name="twitter:card" content="summary_large_image"> + <meta name="twitter:site" content="@nytimes"> + <meta name="twitter:creator" content="@SarahMaslinNir"> + <meta name="twitter:title" content="Parade of Fans for Houston’s Funeral"> + <meta name="twitter:description" content="NEWARK - The guest list and parade of limousines"> + <meta name="twitter:image" content="http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg"> + </head> + <body> + </body> + </html> +`); + +/** + * Runs a task with a new page loaded into a tab in a new browser window. + * + * @param {string} url + * The url to load. + * @param {Function} task + * The task to run. May return a promise. + */ +async function withBrowserInNewWindow(url, task) { + let newWin = await BrowserTestUtils.openNewBrowserWindow(); + let tab = await BrowserTestUtils.openNewForegroundTab(newWin.gBrowser, url); + await task(tab.linkedBrowser); + await BrowserTestUtils.closeWindow(newWin); +} + +add_task(async function test_pagedata_cache() { + let promise = PageDataService.once("page-data"); + + Assert.equal( + PageDataService.getCached(TEST_URL), + null, + "Should be no data cached." 
+ ); + + await BrowserTestUtils.withNewTab(TEST_URL, async () => { + let pageData = await promise; + + Assert.deepEqual( + PageDataService.getCached(TEST_URL), + pageData, + "Should return the same data from the cache" + ); + + delete pageData.date; + + Assert.deepEqual( + pageData, + { + url: TEST_URL, + siteName: "@nytimes", + description: "NEWARK - The guest list and parade of limousines", + image: + "http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg", + data: {}, + }, + "Should have returned the right data" + ); + }); + + Assert.equal( + PageDataService.getCached(TEST_URL), + null, + "Data should no longer be cached." + ); + + promise = PageDataService.once("page-data"); + + // Checks that closing a window containing a tracked tab stops tracking the tab. + await withBrowserInNewWindow(TEST_URL, async () => { + let pageData = await promise; + + Assert.deepEqual( + PageDataService.getCached(TEST_URL), + pageData, + "Should return the same data from the cache" + ); + + delete pageData.date; + Assert.deepEqual( + pageData, + { + url: TEST_URL, + siteName: "@nytimes", + description: "NEWARK - The guest list and parade of limousines", + image: + "http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg", + data: {}, + }, + "Should have returned the right data" + ); + }); + + Assert.equal( + PageDataService.getCached(TEST_URL), + null, + "Data should no longer be cached." + ); + + let actor = {}; + PageDataService.lockEntry(actor, TEST_URL); + + promise = PageDataService.once("page-data"); + + // Closing a tracked tab shouldn't expire the data here as we have another lock. + await BrowserTestUtils.withNewTab(TEST_URL, async () => { + await promise; + }); + + promise = PageDataService.once("page-data"); + + // Closing a window with a tracked tab shouldn't expire the data here as we have another lock. 
+ await withBrowserInNewWindow(TEST_URL, async () => { + await promise; + }); + + let cached = PageDataService.getCached(TEST_URL); + delete cached.date; + Assert.deepEqual( + cached, + { + url: TEST_URL, + siteName: "@nytimes", + description: "NEWARK - The guest list and parade of limousines", + image: + "http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg", + data: {}, + }, + "Entry should still be cached" + ); + + PageDataService.unlockEntry(actor, TEST_URL); + + Assert.equal( + PageDataService.getCached(TEST_URL), + null, + "Data should no longer be cached." + ); +}); diff --git a/browser/components/pagedata/tests/browser/head.js b/browser/components/pagedata/tests/browser/head.js new file mode 100644 index 0000000000..b4f57cdb76 --- /dev/null +++ b/browser/components/pagedata/tests/browser/head.js @@ -0,0 +1,8 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +ChromeUtils.defineESModuleGetters(this, { + PageDataSchema: "resource:///modules/pagedata/PageDataSchema.sys.mjs", + PageDataService: "resource:///modules/pagedata/PageDataService.sys.mjs", +}); diff --git a/browser/components/pagedata/tests/unit/head.js b/browser/components/pagedata/tests/unit/head.js new file mode 100644 index 0000000000..55b002692b --- /dev/null +++ b/browser/components/pagedata/tests/unit/head.js @@ -0,0 +1,105 @@ +/* Any copyright is dedicated to the Public Domain. 
+ * http://creativecommons.org/publicdomain/zero/1.0/ */ + +const { XPCOMUtils } = ChromeUtils.importESModule( + "resource://gre/modules/XPCOMUtils.sys.mjs" +); + +ChromeUtils.defineESModuleGetters(this, { + PageDataSchema: "resource:///modules/pagedata/PageDataSchema.sys.mjs", +}); + +const { HttpServer } = ChromeUtils.importESModule( + "resource://testing-common/httpd.sys.mjs" +); + +const server = new HttpServer(); +server.start(-1); + +const SERVER_PORT = server.identity.primaryPort; +const BASE_URL = "http://localhost:" + SERVER_PORT; +const DEFAULT_PATH = "/document.html"; +const TEST_URL = BASE_URL + DEFAULT_PATH; + +registerCleanupFunction(() => { + server.stop(); +}); + +do_get_profile(); +Services.prefs.setBoolPref("browser.pagedata.log", true); + +/** + * Given a string parses it as HTML into a DOM Document object. + * + * @param {string} str + * The string to parse. + * @param {string} path + * The path for the document on the server, defaults to "/document.html" + * @returns {Promise<Document>} the HTML DOM Document object. + */ +function parseDocument(str, path = DEFAULT_PATH) { + server.registerPathHandler(path, (request, response) => { + response.setHeader("Content-Type", "text/html;charset=utf-8"); + + let converter = Cc[ + "@mozilla.org/intl/converter-output-stream;1" + ].createInstance(Ci.nsIConverterOutputStream); + converter.init(response.bodyOutputStream, "utf-8"); + converter.writeString(str); + }); + + return new Promise((resolve, reject) => { + let request = new XMLHttpRequest(); + request.responseType = "document"; + request.open("GET", BASE_URL + path, true); + + request.addEventListener("error", reject); + request.addEventListener("abort", reject); + + request.addEventListener("load", function () { + resolve(request.responseXML); + }); + + request.send(); + }); +} + +/** + * Parses page data from a HTML string. + * + * @param {string} str + * The HTML string to parse. 
+ * @param {string} path + * The path for the document on the server, defaults to "/document.html" + * @returns {Promise<PageData>} A promise that resolves to the page data found. + */ +async function parsePageData(str, path) { + let doc = await parseDocument(str, path); + return PageDataSchema.collectPageData(doc); +} + +/** + * Verifies that the HTML string given parses to the expected page data. + * + * @param {string} str + * The HTML string to parse. + * @param {PageData} expected + * The expected pagedata excluding the date and url properties. + * @param {string} path + * The path for the document on the server, defaults to "/document.html" + * @returns {Promise<PageData>} A promise that resolves to the page data found. + */ +async function verifyPageData(str, expected, path = DEFAULT_PATH) { + let pageData = await parsePageData(str, path); + + delete pageData.date; + + Assert.equal(pageData.url, BASE_URL + path); + delete pageData.url; + + Assert.deepEqual( + pageData, + expected, + "Should have seen the expected page data." + ); +} diff --git a/browser/components/pagedata/tests/unit/test_opengraph.js b/browser/components/pagedata/tests/unit/test_opengraph.js new file mode 100644 index 0000000000..e5accaf675 --- /dev/null +++ b/browser/components/pagedata/tests/unit/test_opengraph.js @@ -0,0 +1,67 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/** + * Tests that the page data service can parse Open Graph metadata. 
+ */ + +add_task(async function test_type_website() { + await verifyPageData( + ` + <!DOCTYPE html> + <html> + <head> + <title>Internet for people, not profit — Mozilla</title> + <meta property="og:type" content="website"> + <meta property="og:site_name" content="Mozilla"> + <meta property="og:url" content="https://www.mozilla.org/"> + <meta property="og:image" content="https://example.com/preview-image"> + <meta property="og:title" content="Internet for people, not profit"> + <!-- We expect the test will ignore tags the parser does not recognize. --> + <meta property="og:locale" content="en_CA"> + <meta property="og:description" content="Mozilla is the not-for-profit behind the lightning fast Firefox browser. We put people over profit to give everyone more power online."> + </head> + <body> + <p>Test page</p> + </body> + </html> + `, + { + siteName: "Mozilla", + description: + "Mozilla is the not-for-profit behind the lightning fast Firefox browser. We put people over profit to give everyone more power online.", + image: "https://example.com/preview-image", + data: {}, + } + ); +}); + +add_task(async function test_type_movie() { + await verifyPageData( + ` + <!DOCTYPE html> + <html> + <head> + <title>Code Rush (TV Movie 2000)</title> + <meta property="og:url" content="https://www.imdb.com/title/tt0499004/"/> + <!-- Omitting og:site_name to test that the parser doesn't break on missing tags. 
--> + <meta property="og:title" content="Code Rush (TV Movie 2000) - IMDb"/> + <meta property="og:description" content="This is the description of the movie."/> + <meta property="og:type" content="video.movie"/> + <meta property="og:image" content="https://example.com/preview-code-rush"/> + <meta property="og:image:height" content="750"/> + <meta property="og:image:width" content="1000"/> + </head> + <body> + <p>Test page</p> + </body> + </html> + `, + { + image: "https://example.com/preview-code-rush", + description: "This is the description of the movie.", + data: {}, + } + ); +}); diff --git a/browser/components/pagedata/tests/unit/test_pagedata_basic.js b/browser/components/pagedata/tests/unit/test_pagedata_basic.js new file mode 100644 index 0000000000..5d31645a4c --- /dev/null +++ b/browser/components/pagedata/tests/unit/test_pagedata_basic.js @@ -0,0 +1,100 @@ +/* Any copyright is dedicated to the Public Domain. + * http://creativecommons.org/publicdomain/zero/1.0/ */ + +/* + * Simply tests that the notification is dispatched when new page data is + * discovered. + */ + +ChromeUtils.defineESModuleGetters(this, { + PageDataService: "resource:///modules/pagedata/PageDataService.sys.mjs", +}); + +add_task(async function test_pageDataDiscovered_notifies() { + let url = "https://www.mozilla.org/"; + + Assert.equal( + PageDataService.getCached(url), + null, + "Should be no cached data." 
+ ); + + let promise = PageDataService.once("page-data"); + + PageDataService.pageDataDiscovered({ + url, + date: 32453456, + data: { + [PageDataSchema.DATA_TYPE.PRODUCT]: { + name: "Bolts", + price: { value: 276 }, + }, + }, + }); + + let pageData = await promise; + Assert.equal( + pageData.url, + url, + "Should have notified data for the expected url" + ); + + Assert.deepEqual( + pageData, + { + url, + date: 32453456, + data: { + [PageDataSchema.DATA_TYPE.PRODUCT]: { + name: "Bolts", + price: { value: 276 }, + }, + }, + }, + "Should have returned the correct product data" + ); + + Assert.equal( + PageDataService.getCached(url), + null, + "Should not have cached the data as there was no actor locking." + ); + + let actor = {}; + PageDataService.lockEntry(actor, url); + + PageDataService.pageDataDiscovered({ + url, + date: 32453456, + data: { + [PageDataSchema.DATA_TYPE.PRODUCT]: { + name: "Bolts", + price: { value: 276 }, + }, + }, + }); + + // Should now be in the cache. + Assert.deepEqual( + PageDataService.getCached(url), + { + url, + date: 32453456, + data: { + [PageDataSchema.DATA_TYPE.PRODUCT]: { + name: "Bolts", + price: { value: 276 }, + }, + }, + }, + "Should have cached the data" + ); + + PageDataService.unlockEntry(actor, url); + + Assert.equal( + PageDataService.getCached(url), + null, + "Should have dropped the data from the cache." + ); +}); diff --git a/browser/components/pagedata/tests/unit/test_pagedata_schema.js b/browser/components/pagedata/tests/unit/test_pagedata_schema.js new file mode 100644 index 0000000000..fcd9c4b297 --- /dev/null +++ b/browser/components/pagedata/tests/unit/test_pagedata_schema.js @@ -0,0 +1,210 @@ +/* Any copyright is dedicated to the Public Domain. + * http://creativecommons.org/publicdomain/zero/1.0/ */ + +/* + * Tests schema validation. + */ + +add_task(async function testBasic() { + // Old data types, should not be recognised. 
+ Assert.equal(PageDataSchema.nameForType(1), null); + Assert.equal(PageDataSchema.nameForType(2), null); + + Assert.equal( + PageDataSchema.nameForType(PageDataSchema.DATA_TYPE.VIDEO), + "VIDEO" + ); + Assert.equal( + PageDataSchema.nameForType(PageDataSchema.DATA_TYPE.PRODUCT), + "PRODUCT" + ); +}); + +add_task(async function testProduct() { + // Products must have a name + await Assert.rejects( + PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, {}), + /missing required property 'name'/ + ); + + await PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, { + name: "Bolts", + }); + + await PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, { + name: "Bolts", + price: { + value: 5, + }, + }); + + await PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, { + name: "Bolts", + price: { + value: 5, + currency: "USD", + }, + }); + + await Assert.rejects( + PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, { + name: "Bolts", + price: { + currency: "USD", + }, + }), + /missing required property 'value'/ + ); + + await PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, { + name: "Bolts", + shippingCost: { + value: 5, + currency: "USD", + }, + }); + + await Assert.rejects( + PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, { + name: "Bolts", + shippingCost: { + currency: "USD", + }, + }), + /missing required property 'value'/ + ); +}); + +add_task(async function testCoalesce() { + let joined = PageDataSchema.coalescePageData({}, {}); + Assert.deepEqual(joined, { data: {} }); + + joined = PageDataSchema.coalescePageData( + { + url: "https://www.google.com/", + data: { + [PageDataSchema.DATA_TYPE.PRODUCT]: { + name: "bolts", + }, + [PageDataSchema.DATA_TYPE.VIDEO]: { + name: "My video", + duration: 500, + }, + }, + }, + { + url: "https://www.mozilla.com/", + date: 27, + siteName: "Mozilla", + data: { + [PageDataSchema.DATA_TYPE.PRODUCT]: { + name: "newname", + price: { + value: 55, + }, + }, + 
[PageDataSchema.DATA_TYPE.AUDIO]: { + name: "My song", + }, + }, + } + ); + + Assert.deepEqual(joined, { + url: "https://www.google.com/", + date: 27, + siteName: "Mozilla", + data: { + [PageDataSchema.DATA_TYPE.PRODUCT]: { + name: "bolts", + price: { + value: 55, + }, + }, + [PageDataSchema.DATA_TYPE.VIDEO]: { + name: "My video", + duration: 500, + }, + [PageDataSchema.DATA_TYPE.AUDIO]: { + name: "My song", + }, + }, + }); +}); + +add_task(async function testPageData() { + // Full page data needs a url and a date + await Assert.rejects( + PageDataSchema.validatePageData({}), + /missing required property 'url'/ + ); + + await Assert.rejects( + PageDataSchema.validatePageData({ url: "https://www.google.com" }), + /missing required property 'date'/ + ); + + await Assert.rejects( + PageDataSchema.validatePageData({ date: 55 }), + /missing required property 'url'/ + ); + + Assert.deepEqual( + await PageDataSchema.validatePageData({ + url: "https://www.google.com", + date: 55, + }), + { url: "https://www.google.com", date: 55, data: {} } + ); + + Assert.deepEqual( + await PageDataSchema.validatePageData({ + url: "https://www.google.com", + date: 55, + data: { + 0: { + name: "unknown", + }, + [PageDataSchema.DATA_TYPE.PRODUCT]: { + name: "Bolts", + price: { + value: 55, + }, + }, + }, + }), + { + url: "https://www.google.com", + date: 55, + data: { + [PageDataSchema.DATA_TYPE.PRODUCT]: { + name: "Bolts", + price: { + value: 55, + }, + }, + }, + } + ); + + // Should drop invalid inner data. 
+ Assert.deepEqual( + await PageDataSchema.validatePageData({ + url: "https://www.google.com", + date: 55, + data: { + [PageDataSchema.DATA_TYPE.PRODUCT]: { + name: "Bolts", + price: { + currency: "USD", + }, + }, + }, + }), + { + url: "https://www.google.com", + date: 55, + data: {}, + } + ); +}); diff --git a/browser/components/pagedata/tests/unit/test_queue.js b/browser/components/pagedata/tests/unit/test_queue.js new file mode 100644 index 0000000000..d683c9a601 --- /dev/null +++ b/browser/components/pagedata/tests/unit/test_queue.js @@ -0,0 +1,512 @@ +/* Any copyright is dedicated to the Public Domain. + * http://creativecommons.org/publicdomain/zero/1.0/ */ + +ChromeUtils.defineESModuleGetters(this, { + PageDataService: "resource:///modules/pagedata/PageDataService.sys.mjs", + TestUtils: "resource://testing-common/TestUtils.sys.mjs", +}); + +// Test that urls are retrieved in the expected order. +add_task(async function test_queueOrder() { + Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 0); + // Pretend we are idle. 
+ PageDataService.observe(null, "idle", null); + + let pageDataResults = [ + { + date: Date.now(), + url: "http://www.mozilla.org/1", + siteName: "Mozilla", + data: {}, + }, + { + date: Date.now() - 3600, + url: "http://www.google.com/2", + siteName: "Google", + data: {}, + }, + { + date: Date.now() + 3600, + url: "http://www.example.com/3", + image: "http://www.example.com/banner.jpg", + data: {}, + }, + { + date: Date.now() / 2, + url: "http://www.wikipedia.org/4", + data: {}, + }, + { + date: Date.now() / 3, + url: "http://www.microsoft.com/5", + data: { + [PageDataSchema.DATA_TYPE.PRODUCT]: { + name: "Windows 11", + }, + }, + }, + ]; + + let requests = []; + PageDataService.fetchPageData = url => { + requests.push(url); + + for (let pageData of pageDataResults) { + if (pageData.url == url) { + return Promise.resolve(pageData); + } + } + + return Promise.reject(new Error("Unknown url")); + }; + + let { promise: completePromise, resolve } = Promise.withResolvers(); + + let results = []; + let listener = (_, pageData) => { + results.push(pageData); + if (results.length == pageDataResults.length) { + resolve(); + } + }; + + PageDataService.on("page-data", listener); + + for (let pageData of pageDataResults) { + PageDataService.queueFetch(pageData.url); + } + + await completePromise; + PageDataService.off("page-data", listener); + + Assert.deepEqual( + requests, + pageDataResults.map(pd => pd.url) + ); + + // Because our fetch implementation is essentially synchronous the results + // will be in a known order. This isn't guaranteed by the API though. + Assert.deepEqual(results, pageDataResults); + + delete PageDataService.fetchPageData; +}); + +// Tests that limiting the number of fetches works. +add_task(async function test_queueLimit() { + Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 3); + // Pretend we are idle. 
+ PageDataService.observe(null, "idle", null); + + let requests = []; + PageDataService.fetchPageData = url => { + let { promise, resolve, reject } = Promise.withResolvers(); + requests.push({ url, resolve, reject }); + + return promise; + }; + + let results = []; + let listener = (_, pageData) => { + results.push(pageData?.url); + }; + + PageDataService.on("page-data", listener); + + PageDataService.queueFetch("https://www.mozilla.org/1"); + PageDataService.queueFetch("https://www.mozilla.org/2"); + PageDataService.queueFetch("https://www.mozilla.org/3"); + PageDataService.queueFetch("https://www.mozilla.org/4"); + PageDataService.queueFetch("https://www.mozilla.org/5"); + PageDataService.queueFetch("https://www.mozilla.org/6"); + PageDataService.queueFetch("https://www.mozilla.org/7"); + PageDataService.queueFetch("https://www.mozilla.org/8"); + PageDataService.queueFetch("https://www.mozilla.org/9"); + PageDataService.queueFetch("https://www.mozilla.org/10"); + PageDataService.queueFetch("https://www.mozilla.org/11"); + + await TestUtils.waitForTick(); + + Assert.deepEqual( + requests.map(r => r.url), + [ + "https://www.mozilla.org/1", + "https://www.mozilla.org/2", + "https://www.mozilla.org/3", + ] + ); + + // Completing or rejecting a request should start new ones. + + requests[1].resolve({ + date: 2345, + url: "https://www.mozilla.org/2", + siteName: "Test 2", + data: {}, + }); + + await TestUtils.waitForTick(); + + Assert.deepEqual( + requests.map(r => r.url), + [ + "https://www.mozilla.org/1", + "https://www.mozilla.org/2", + "https://www.mozilla.org/3", + "https://www.mozilla.org/4", + ] + ); + + requests[3].reject(new Error("Fail")); + + await TestUtils.waitForTick(); + + Assert.deepEqual( + requests.map(r => r.url), + [ + "https://www.mozilla.org/1", + "https://www.mozilla.org/2", + "https://www.mozilla.org/3", + "https://www.mozilla.org/4", + "https://www.mozilla.org/5", + ] + ); + + // Increasing the limit should start more requests. 
+ Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 5); + + await TestUtils.waitForTick(); + + Assert.deepEqual( + requests.map(r => r.url), + [ + "https://www.mozilla.org/1", + "https://www.mozilla.org/2", + "https://www.mozilla.org/3", + "https://www.mozilla.org/4", + "https://www.mozilla.org/5", + "https://www.mozilla.org/6", + "https://www.mozilla.org/7", + ] + ); + + // Dropping the limit shouldn't start anything new. + Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 3); + + await TestUtils.waitForTick(); + + Assert.deepEqual( + requests.map(r => r.url), + [ + "https://www.mozilla.org/1", + "https://www.mozilla.org/2", + "https://www.mozilla.org/3", + "https://www.mozilla.org/4", + "https://www.mozilla.org/5", + "https://www.mozilla.org/6", + "https://www.mozilla.org/7", + ] + ); + + // But resolving should also not start new requests. + requests[5].resolve({ + date: 345334, + url: "https://www.mozilla.org/6", + siteName: "Test 6", + data: {}, + }); + + requests[0].resolve({ + date: 343446434, + url: "https://www.mozilla.org/1", + siteName: "Test 1", + data: {}, + }); + + await TestUtils.waitForTick(); + + Assert.deepEqual( + requests.map(r => r.url), + [ + "https://www.mozilla.org/1", + "https://www.mozilla.org/2", + "https://www.mozilla.org/3", + "https://www.mozilla.org/4", + "https://www.mozilla.org/5", + "https://www.mozilla.org/6", + "https://www.mozilla.org/7", + ] + ); + + // Until a previous request completes. + requests[4].resolve(null); + + await TestUtils.waitForTick(); + + Assert.deepEqual( + requests.map(r => r.url), + [ + "https://www.mozilla.org/1", + "https://www.mozilla.org/2", + "https://www.mozilla.org/3", + "https://www.mozilla.org/4", + "https://www.mozilla.org/5", + "https://www.mozilla.org/6", + "https://www.mozilla.org/7", + "https://www.mozilla.org/8", + ] + ); + + // Infinite queue should work. 
+ Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 0); + + await TestUtils.waitForTick(); + + Assert.deepEqual( + requests.map(r => r.url), + [ + "https://www.mozilla.org/1", + "https://www.mozilla.org/2", + "https://www.mozilla.org/3", + "https://www.mozilla.org/4", + "https://www.mozilla.org/5", + "https://www.mozilla.org/6", + "https://www.mozilla.org/7", + "https://www.mozilla.org/8", + "https://www.mozilla.org/9", + "https://www.mozilla.org/10", + "https://www.mozilla.org/11", + ] + ); + + requests[10].resolve({ + date: 345334, + url: "https://www.mozilla.org/11", + data: {}, + }); + requests[2].resolve({ + date: 345334, + url: "https://www.mozilla.org/3", + data: {}, + }); + requests[7].resolve({ + date: 345334, + url: "https://www.mozilla.org/8", + data: {}, + }); + requests[6].resolve({ + date: 345334, + url: "https://www.mozilla.org/7", + data: {}, + }); + requests[8].resolve({ + date: 345334, + url: "https://www.mozilla.org/9", + data: {}, + }); + requests[9].resolve({ + date: 345334, + url: "https://www.mozilla.org/10", + data: {}, + }); + + await TestUtils.waitForTick(); + + Assert.deepEqual( + requests.map(r => r.url), + [ + "https://www.mozilla.org/1", + "https://www.mozilla.org/2", + "https://www.mozilla.org/3", + "https://www.mozilla.org/4", + "https://www.mozilla.org/5", + "https://www.mozilla.org/6", + "https://www.mozilla.org/7", + "https://www.mozilla.org/8", + "https://www.mozilla.org/9", + "https://www.mozilla.org/10", + "https://www.mozilla.org/11", + ] + ); + + PageDataService.off("page-data", listener); + + delete PageDataService.fetchPageData; + + Assert.deepEqual(results, [ + "https://www.mozilla.org/2", + "https://www.mozilla.org/6", + "https://www.mozilla.org/1", + "https://www.mozilla.org/11", + "https://www.mozilla.org/3", + "https://www.mozilla.org/8", + "https://www.mozilla.org/7", + "https://www.mozilla.org/9", + "https://www.mozilla.org/10", + ]); +}); + +// Tests that the user idle state stops and starts 
fetches. +add_task(async function test_idle() { + Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 3); + // Pretend we are active. + PageDataService.observe(null, "active", null); + + let requests = []; + PageDataService.fetchPageData = url => { + let { promise, resolve, reject } = Promise.withResolvers(); + requests.push({ url, resolve, reject }); + + return promise; + }; + + let results = []; + let listener = (_, pageData) => { + results.push(pageData?.url); + }; + + PageDataService.on("page-data", listener); + + PageDataService.queueFetch("https://www.mozilla.org/1"); + PageDataService.queueFetch("https://www.mozilla.org/2"); + PageDataService.queueFetch("https://www.mozilla.org/3"); + PageDataService.queueFetch("https://www.mozilla.org/4"); + PageDataService.queueFetch("https://www.mozilla.org/5"); + PageDataService.queueFetch("https://www.mozilla.org/6"); + PageDataService.queueFetch("https://www.mozilla.org/7"); + + await TestUtils.waitForTick(); + + // Nothing will start when active. + Assert.deepEqual( + requests.map(r => r.url), + [] + ); + + // Pretend we are idle. + PageDataService.observe(null, "idle", null); + + await TestUtils.waitForTick(); + + Assert.deepEqual( + requests.map(r => r.url), + [ + "https://www.mozilla.org/1", + "https://www.mozilla.org/2", + "https://www.mozilla.org/3", + ] + ); + + // Completing or rejecting a request should start new ones. 
+ + requests[1].resolve({ + date: 2345, + url: "https://www.mozilla.org/2", + data: {}, + }); + + await TestUtils.waitForTick(); + + Assert.deepEqual( + requests.map(r => r.url), + [ + "https://www.mozilla.org/1", + "https://www.mozilla.org/2", + "https://www.mozilla.org/3", + "https://www.mozilla.org/4", + ] + ); + + // But not when active + PageDataService.observe(null, "active", null); + + requests[3].resolve({ + date: 2345, + url: "https://www.mozilla.org/4", + data: {}, + }); + requests[0].resolve({ + date: 2345, + url: "https://www.mozilla.org/1", + data: {}, + }); + requests[2].resolve({ + date: 2345, + url: "https://www.mozilla.org/3", + data: {}, + }); + + await TestUtils.waitForTick(); + + Assert.deepEqual( + requests.map(r => r.url), + [ + "https://www.mozilla.org/1", + "https://www.mozilla.org/2", + "https://www.mozilla.org/3", + "https://www.mozilla.org/4", + ] + ); + + // Going idle should start more workers + PageDataService.observe(null, "idle", null); + + await TestUtils.waitForTick(); + + Assert.deepEqual( + requests.map(r => r.url), + [ + "https://www.mozilla.org/1", + "https://www.mozilla.org/2", + "https://www.mozilla.org/3", + "https://www.mozilla.org/4", + "https://www.mozilla.org/5", + "https://www.mozilla.org/6", + "https://www.mozilla.org/7", + ] + ); + + requests[4].resolve({ + date: 2345, + url: "https://www.mozilla.org/5", + data: {}, + }); + requests[5].resolve({ + date: 2345, + url: "https://www.mozilla.org/6", + data: {}, + }); + requests[6].resolve({ + date: 2345, + url: "https://www.mozilla.org/7", + data: {}, + }); + + await TestUtils.waitForTick(); + + Assert.deepEqual( + requests.map(r => r.url), + [ + "https://www.mozilla.org/1", + "https://www.mozilla.org/2", + "https://www.mozilla.org/3", + "https://www.mozilla.org/4", + "https://www.mozilla.org/5", + "https://www.mozilla.org/6", + "https://www.mozilla.org/7", + ] + ); + + PageDataService.off("page-data", listener); + + delete PageDataService.fetchPageData; + + 
Assert.deepEqual(results, [ + "https://www.mozilla.org/2", + "https://www.mozilla.org/4", + "https://www.mozilla.org/1", + "https://www.mozilla.org/3", + "https://www.mozilla.org/5", + "https://www.mozilla.org/6", + "https://www.mozilla.org/7", + ]); +}); diff --git a/browser/components/pagedata/tests/unit/test_schemaorg.js b/browser/components/pagedata/tests/unit/test_schemaorg.js new file mode 100644 index 0000000000..5470410e4f --- /dev/null +++ b/browser/components/pagedata/tests/unit/test_schemaorg.js @@ -0,0 +1,213 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/** + * Tests that the page data service can parse schema.org metadata into PageData. + */ + +add_task(async function test_single_product_microdata() { + await verifyPageData( + ` + <!DOCTYPE html> + <html> + <head> + <title>Product Info 1</title> + </head> + <body> + <div itemscope itemtype="https://schema.org/Organization"> + <div itemprop="employee" itemscope itemtype="https://schema.org/Person"> + <span itemprop="name">Mr. 
Nested Name</span> + </div> + + <span itemprop="name">Mozilla</span> + </div> + + <div itemscope itemtype="https://schema.org/Product"> + <img itemprop="image" src="bon-echo-microwave-17in.jpg" /> + <a href="microwave.html" itemprop="url"> + <span itemprop="name">Bon Echo Microwave</span> + </a> + + <div itemprop="offers" itemscope itemtype="https://schema.org/Offer"> + <span itemprop="price" content="3.50">£3.50</span> + <span itemprop="priceCurrency" content="GBP"></span> + </div> + + <span itemprop="gtin" content="13572468"></span> + + <span itemprop="description">The most amazing microwave in the world</span> + </div> + </body> + </html> + `, + { + siteName: "Mozilla", + description: "The most amazing microwave in the world", + image: BASE_URL + "/bon-echo-microwave-17in.jpg", + data: { + [PageDataSchema.DATA_TYPE.PRODUCT]: { + name: "Bon Echo Microwave", + price: { + value: 3.5, + currency: "GBP", + }, + }, + }, + } + ); +}); + +add_task(async function test_single_product_json_ld() { + await verifyPageData( + ` + <!DOCTYPE html> + <html> + <head> + <script type="application/ld+json"> + { + "@context": "http://schema.org", + "@type": "Organization", + "employee": { + "@type": "Person", + "name": "Mr. 
Nested Name" + }, + "name": "Mozilla" + } + </script> + <script type="application/ld+json"> + { + "@context": "https://schema.org", + "@type": "Product", + "image": "bon-echo-microwave-17in.jpg", + "url": "microwave.html", + "name": "Bon Echo Microwave", + "offers": { + "@type": "Offer", + "price": "3.50", + "priceCurrency": "GBP" + }, + "gtin": "13572468", + "description": "The most amazing microwave in the world" + } + </script> + </head> + <body> + </body> + </html> + `, + { + siteName: "Mozilla", + description: "The most amazing microwave in the world", + image: BASE_URL + "/bon-echo-microwave-17in.jpg", + data: { + [PageDataSchema.DATA_TYPE.PRODUCT]: { + name: "Bon Echo Microwave", + price: { + value: 3.5, + currency: "GBP", + }, + }, + }, + } + ); +}); + +add_task(async function test_single_product_combined() { + await verifyPageData( + ` + <!DOCTYPE html> + <html> + <head> + <script type="application/ld+json"> + { + "@context": "https://schema.org", + "@type": "Product", + "image": "bon-echo-microwave-17in.jpg", + "url": "microwave.html", + "name": "Bon Echo Microwave", + "offers": { + "@type": "Offer", + "price": "3.50", + "priceCurrency": "GBP" + }, + "gtin": "13572468", + "description": "The most amazing microwave in the world" + } + </script> + </head> + <body> + <div itemscope itemtype="https://schema.org/Organization"> + <div itemprop="employee" itemscope itemtype="https://schema.org/Person"> + <span itemprop="name">Mr. 
Nested Name</span> + </div> + + <span itemprop="name">Mozilla</span> + </div> + </body> + </html> + `, + { + siteName: "Mozilla", + description: "The most amazing microwave in the world", + image: BASE_URL + "/bon-echo-microwave-17in.jpg", + data: { + [PageDataSchema.DATA_TYPE.PRODUCT]: { + name: "Bon Echo Microwave", + price: { + value: 3.5, + currency: "GBP", + }, + }, + }, + } + ); +}); + +add_task(async function test_single_multiple_microdata() { + await verifyPageData( + ` + <!DOCTYPE html> + <html> + <head> + <title>Product Info 2</title> + </head> + <body> + <div itemscope itemtype="https://schema.org/Product"> + <img itemprop="image" src="bon-echo-microwave-17in.jpg" /> + <a href="microwave.html" itemprop="url"> + <span itemprop="name">Bon Echo Microwave</span> + </a> + + <div itemprop="offers" itemscope itemtype="https://schema.org/Offer"> + <span itemprop="price" content="3.28">£3.28</span> + <span itemprop="priceCurrency" content="GBP"></span> + </div> + + <span itemprop="gtin" content="13572468"></span> + </div> + <div itemscope itemtype="http://schema.org/Product"> + <img itemprop="image" src="gran-paradiso-toaster-17in.jpg" /> + <a href="toaster.html" itemprop="url"> + <span itemprop="name">Gran Paradiso Toaster</span> + </a> + + <span itemprop="gtin" content="15263748"></span> + </div> + </body> + </html> + `, + { + image: BASE_URL + "/bon-echo-microwave-17in.jpg", + data: { + [PageDataSchema.DATA_TYPE.PRODUCT]: { + name: "Bon Echo Microwave", + price: { + value: 3.28, + currency: "GBP", + }, + }, + }, + } + ); +}); diff --git a/browser/components/pagedata/tests/unit/test_schemaorg_parse.js b/browser/components/pagedata/tests/unit/test_schemaorg_parse.js new file mode 100644 index 0000000000..e002598af2 --- /dev/null +++ b/browser/components/pagedata/tests/unit/test_schemaorg_parse.js @@ -0,0 +1,193 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/** + * Tests that the page data service can parse schema.org metadata into Item + * structures. + */ + +const { SchemaOrgPageData } = ChromeUtils.importESModule( + "resource:///modules/pagedata/SchemaOrgPageData.sys.mjs" +); + +/** + * Collects the schema.org items from the given html string. + * + * @param {string} docStr + * The html to parse. + * @returns {Promise<Item[]>} + */ +async function collectItems(docStr) { + let doc = await parseDocument(docStr); + return SchemaOrgPageData.collectItems(doc); +} + +/** + * Verifies that the items parsed from the html match the expected JSON-LD + * format. + * + * @param {string} docStr + * The html to parse. + * @param {object[]} expected + * The JSON-LD objects to match to. + */ +async function verifyItems(docStr, expected) { + let items = await collectItems(docStr); + let jsonLD = items.map(item => item.toJsonLD()); + Assert.deepEqual(jsonLD, expected); +} + +add_task(async function test_microdata_parse() { + await verifyItems( + ` + <!DOCTYPE html> + <html> + <head> + <title>Product Info 1</title> + </head> + <body itemprop="badprop"> + <div itemscope itemtype="https://schema.org/Organization"> + <div itemprop="employee" itemscope itemtype="https://schema.org/Person"> + <span itemprop="name">Mr. 
Nested Name</span> + </div> + + <span itemprop="name">Mozilla</span> + </div> + + <div itemscope itemtype="https://schema.org/Product"> + <img itemprop="image" src="bon-echo-microwave-17in.jpg" /> + <a href="microwave.html" itemprop="url"> + <span itemprop="name">Bon Echo Microwave</span> + </a> + + <div itemprop="offers" itemscope itemtype="https://schema.org/Offer"> + <span itemprop="price" content="3.50">£3.50</span> + <span itemprop="priceCurrency" content="GBP"></span> + </div> + + <span itemprop="gtin" content="13572468"></span> + + <span itemprop="description">The most amazing microwave in the world</span> + </div> + </body> + </html> + `, + [ + { + "@type": "Organization", + employee: { + "@type": "Person", + name: "Mr. Nested Name", + }, + name: "Mozilla", + }, + { + "@type": "Product", + image: BASE_URL + "/bon-echo-microwave-17in.jpg", + url: BASE_URL + "/microwave.html", + name: "Bon Echo Microwave", + offers: { + "@type": "Offer", + price: "3.50", + priceCurrency: "GBP", + }, + gtin: "13572468", + description: "The most amazing microwave in the world", + }, + ] + ); +}); + +add_task(async function test_json_ld_parse() { + await verifyItems( + ` + <!DOCTYPE html> + <html> + <head> + <script type="application/ld+json"> + { + "@context": "http://schema.org", + "@type": "Organization", + "employee": { + "@type": "Person", + "name": "Mr. Nested Name" + }, + "name": "Mozilla" + } + </script> + <script type="application/ld+json"> + { + "@context": "https://schema.org", + "@type": "Product", + "image": "bon-echo-microwave-17in.jpg", + "url": "microwave.html", + "name": "Bon Echo Microwave", + "offers": { + "@type": "Offer", + "price": "3.50", + "priceCurrency": "GBP" + }, + "gtin": "13572468", + "description": "The most amazing microwave in the world" + } + </script> + </head> + <body> + </body> + </html> + `, + [ + { + "@type": "Organization", + employee: { + "@type": "Person", + name: "Mr. 
Nested Name", + }, + name: "Mozilla", + }, + { + "@type": "Product", + image: "bon-echo-microwave-17in.jpg", + url: "microwave.html", + name: "Bon Echo Microwave", + offers: { + "@type": "Offer", + price: "3.50", + priceCurrency: "GBP", + }, + gtin: "13572468", + description: "The most amazing microwave in the world", + }, + ] + ); +}); + +add_task(async function test_microdata_lazy_image() { + await verifyItems( + ` + <!DOCTYPE html> + <html> + <head> + <title>Product Info 1</title> + </head> + <body itemprop="badprop"> + <div itemscope itemtype="https://schema.org/Product"> + <img itemprop="image" src="lazy-load.gif" data-src="bon-echo-microwave-17in.jpg" /> + <a href="microwave.html" itemprop="url"> + <span itemprop="name">Bon Echo Microwave</span> + </a> + </div> + </body> + </html> + `, + [ + { + "@type": "Product", + image: BASE_URL + "/bon-echo-microwave-17in.jpg", + url: BASE_URL + "/microwave.html", + name: "Bon Echo Microwave", + }, + ] + ); +}); diff --git a/browser/components/pagedata/tests/unit/test_twitter.js b/browser/components/pagedata/tests/unit/test_twitter.js new file mode 100644 index 0000000000..a49491f5c6 --- /dev/null +++ b/browser/components/pagedata/tests/unit/test_twitter.js @@ -0,0 +1,34 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/** + * Basic tests for twitter cards. 
+ */ + +add_task(async function test_twitter_card() { + await verifyPageData( + ` + <!DOCTYPE html> + <html> + <head> + <meta name="twitter:card" content="summary_large_image"> + <meta name="twitter:site" content="@nytimes"> + <meta name="twitter:creator" content="@SarahMaslinNir"> + <meta name="twitter:title" content="Parade of Fans for Houston’s Funeral"> + <meta name="twitter:description" content="NEWARK - The guest list and parade of limousines"> + <meta name="twitter:image" content="http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg"> + </head> + <body> + </body> + </html> + `, + { + siteName: "@nytimes", + description: "NEWARK - The guest list and parade of limousines", + image: + "http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg", + data: {}, + } + ); +}); diff --git a/browser/components/pagedata/tests/unit/xpcshell.toml b/browser/components/pagedata/tests/unit/xpcshell.toml new file mode 100644 index 0000000000..a04ab47455 --- /dev/null +++ b/browser/components/pagedata/tests/unit/xpcshell.toml @@ -0,0 +1,19 @@ +[DEFAULT] +firefox-appdir = "browser" +skip-if = ["os == 'android'"] # bug 1730213 +support-files = ["head.js"] +head = "head.js" + +["test_opengraph.js"] + +["test_pagedata_basic.js"] + +["test_pagedata_schema.js"] + +["test_queue.js"] + +["test_schemaorg.js"] + +["test_schemaorg_parse.js"] + +["test_twitter.js"] |