summaryrefslogtreecommitdiffstats
path: root/browser/components/pagedata
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 19:33:14 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 19:33:14 +0000
commit36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree105e8c98ddea1c1e4784a60a5a6410fa416be2de /browser/components/pagedata
parentInitial commit. (diff)
downloadfirefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz
firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip
Adding upstream version 115.7.0esr.upstream/115.7.0esr
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'browser/components/pagedata')
-rw-r--r--browser/components/pagedata/.eslintrc.js14
-rw-r--r--browser/components/pagedata/OpenGraphPageData.sys.mjs46
-rw-r--r--browser/components/pagedata/PageDataChild.sys.mjs121
-rw-r--r--browser/components/pagedata/PageDataParent.sys.mjs57
-rw-r--r--browser/components/pagedata/PageDataSchema.sys.mjs251
-rw-r--r--browser/components/pagedata/PageDataService.sys.mjs681
-rw-r--r--browser/components/pagedata/SchemaOrgPageData.sys.mjs441
-rw-r--r--browser/components/pagedata/TwitterPageData.sys.mjs42
-rw-r--r--browser/components/pagedata/docs/index.md50
-rw-r--r--browser/components/pagedata/jar.mn6
-rw-r--r--browser/components/pagedata/moz.build29
-rw-r--r--browser/components/pagedata/schemas/article.schema.json26
-rw-r--r--browser/components/pagedata/schemas/audio.schema.json34
-rw-r--r--browser/components/pagedata/schemas/document.schema.json18
-rw-r--r--browser/components/pagedata/schemas/general.schema.json30
-rw-r--r--browser/components/pagedata/schemas/product.schema.json46
-rw-r--r--browser/components/pagedata/schemas/video.schema.json38
-rw-r--r--browser/components/pagedata/tests/browser/browser.ini14
-rw-r--r--browser/components/pagedata/tests/browser/browser_pagedata_background.js48
-rw-r--r--browser/components/pagedata/tests/browser/browser_pagedata_basic.js64
-rw-r--r--browser/components/pagedata/tests/browser/browser_pagedata_cache.js155
-rw-r--r--browser/components/pagedata/tests/browser/head.js8
-rw-r--r--browser/components/pagedata/tests/unit/head.js103
-rw-r--r--browser/components/pagedata/tests/unit/test_opengraph.js67
-rw-r--r--browser/components/pagedata/tests/unit/test_pagedata_basic.js100
-rw-r--r--browser/components/pagedata/tests/unit/test_pagedata_schema.js210
-rw-r--r--browser/components/pagedata/tests/unit/test_queue.js527
-rw-r--r--browser/components/pagedata/tests/unit/test_schemaorg.js213
-rw-r--r--browser/components/pagedata/tests/unit/test_schemaorg_parse.js193
-rw-r--r--browser/components/pagedata/tests/unit/test_twitter.js34
-rw-r--r--browser/components/pagedata/tests/unit/xpcshell.ini14
31 files changed, 3680 insertions, 0 deletions
diff --git a/browser/components/pagedata/.eslintrc.js b/browser/components/pagedata/.eslintrc.js
new file mode 100644
index 0000000000..8ead689bcc
--- /dev/null
+++ b/browser/components/pagedata/.eslintrc.js
@@ -0,0 +1,14 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+"use strict";
+
+// ESLint configuration for the files that live next to this .eslintrc.js.
+module.exports = {
+  // Start from the "require-jsdoc" preset shipped by the mozilla plugin.
+  extends: ["plugin:mozilla/require-jsdoc"],
+
+  rules: {
+    // Violations of either rule are reported as errors rather than warnings.
+    "mozilla/var-only-at-top-level": "error",
+    "no-unused-expressions": "error",
+  },
+};
diff --git a/browser/components/pagedata/OpenGraphPageData.sys.mjs b/browser/components/pagedata/OpenGraphPageData.sys.mjs
new file mode 100644
index 0000000000..8f8b361799
--- /dev/null
+++ b/browser/components/pagedata/OpenGraphPageData.sys.mjs
@@ -0,0 +1,46 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Collects Open Graph (https://opengraphprotocol.org/) related data from a page.
+ */
+export const OpenGraphPageData = {
+  /**
+   * Collects the opengraph data from the page.
+   *
+   * Only the `og:description`, `og:site_name` and `og:image` properties are
+   * read, every other `og:` property is ignored. When a property appears more
+   * than once the last occurrence in document order wins.
+   *
+   * @param {Document} document
+   *   The document to collect from
+   *
+   * @returns {PageData}
+   *   An object holding whichever of `description`, `siteName` and `image`
+   *   were found. It is empty when the page has no matching tags.
+   */
+  collect(document) {
+    let pageData = {};
+
+    // Sites can technically define an Open Graph prefix other than `og:`.
+    // However, `og:` is one of the default RDFa prefixes and it's likely
+    // uncommon that sites use a custom prefix. If we find that metadata is
+    // missing for common sites due to this issue, we could consider adding a
+    // basic RDFa parser.
+    let openGraphTags = document.querySelectorAll("meta[property^='og:']");
+
+    for (let tag of openGraphTags) {
+      // Strip "og:" from the property name.
+      let propertyName = tag.getAttribute("property").substring(3);
+
+      switch (propertyName) {
+        case "description":
+          pageData.description = tag.getAttribute("content");
+          break;
+        case "site_name":
+          pageData.siteName = tag.getAttribute("content");
+          break;
+        case "image":
+          pageData.image = tag.getAttribute("content");
+          break;
+      }
+    }
+
+    return pageData;
+  },
+};
diff --git a/browser/components/pagedata/PageDataChild.sys.mjs b/browser/components/pagedata/PageDataChild.sys.mjs
new file mode 100644
index 0000000000..51dc384526
--- /dev/null
+++ b/browser/components/pagedata/PageDataChild.sys.mjs
@@ -0,0 +1,121 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
+
+const lazy = {};
+
+ChromeUtils.defineESModuleGetters(lazy, {
+ PageDataSchema: "resource:///modules/pagedata/PageDataSchema.sys.mjs",
+ PrivateBrowsingUtils: "resource://gre/modules/PrivateBrowsingUtils.sys.mjs",
+});
+
+// We defer any attempt to check for page data for a short time after a page
+// loads to allow JS to operate.
+XPCOMUtils.defineLazyPreferenceGetter(
+ lazy,
+ "READY_DELAY",
+ "browser.pagedata.readyDelay",
+ 500
+);
+
+/**
+ * The actor responsible for monitoring a page for page data.
+ *
+ * Messages and DOM events are ignored while the content window is considered
+ * private.
+ */
+export class PageDataChild extends JSWindowActorChild {
+  /**
+   * Whether the content window is private. Starts out true, which makes the
+   * message and event handlers do nothing, until `actorCreated` stores the
+   * real state.
+   *
+   * @type {boolean}
+   */
+  #isContentWindowPrivate = true;
+  /**
+   * Used to debounce notifications about a page being ready.
+   *
+   * @type {Timer | null}
+   */
+  #deferTimer = null;
+
+  /**
+   * Called when the actor is created for a new page.
+   */
+  actorCreated() {
+    this.#isContentWindowPrivate =
+      lazy.PrivateBrowsingUtils.isContentWindowPrivate(this.contentWindow);
+  }
+
+  /**
+   * Called when the page is destroyed. Cancels any pending ready notification.
+   */
+  didDestroy() {
+    if (this.#deferTimer) {
+      this.#deferTimer.cancel();
+    }
+  }
+
+  /**
+   * Called when the page has signalled it is done loading. This signal is
+   * debounced by READY_DELAY.
+   */
+  #deferReady() {
+    if (!this.#deferTimer) {
+      this.#deferTimer = Cc["@mozilla.org/timer;1"].createInstance(Ci.nsITimer);
+    }
+
+    // If the timer was already running this re-starts it.
+    this.#deferTimer.initWithCallback(
+      () => {
+        this.#deferTimer = null;
+        this.sendAsyncMessage("PageData:DocumentReady", {
+          url: this.document.documentURI,
+        });
+      },
+      lazy.READY_DELAY,
+      Ci.nsITimer.TYPE_ONE_SHOT_LOW_PRIORITY
+    );
+  }
+
+  /**
+   * Called when a message is received from the parent process.
+   *
+   * @param {ReceiveMessageArgument} msg
+   *   The received message.
+   *
+   * @returns {Promise | undefined}
+   *   A promise for the requested data or undefined if no data was requested.
+   */
+  receiveMessage(msg) {
+    if (this.#isContentWindowPrivate) {
+      return undefined;
+    }
+
+    switch (msg.name) {
+      case "PageData:CheckLoaded":
+        // The service just started in the parent. Check if this document is
+        // already loaded. Note the DOM property is spelled `readyState`; a
+        // lower-case `readystate` is always undefined.
+        if (this.document.readyState == "complete") {
+          this.#deferReady();
+        }
+        break;
+      case "PageData:Collect":
+        return lazy.PageDataSchema.collectPageData(this.document);
+    }
+
+    return undefined;
+  }
+
+  /**
+   * DOM event handler. Both `DOMContentLoaded` and `pageshow` (re)start the
+   * debounced ready notification.
+   *
+   * @param {Event} event
+   *   The DOM event.
+   */
+  handleEvent(event) {
+    if (this.#isContentWindowPrivate) {
+      return;
+    }
+
+    switch (event.type) {
+      case "DOMContentLoaded":
+      case "pageshow":
+        this.#deferReady();
+        break;
+    }
+  }
+}
diff --git a/browser/components/pagedata/PageDataParent.sys.mjs b/browser/components/pagedata/PageDataParent.sys.mjs
new file mode 100644
index 0000000000..c3e7743b99
--- /dev/null
+++ b/browser/components/pagedata/PageDataParent.sys.mjs
@@ -0,0 +1,57 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+const lazy = {};
+
+ChromeUtils.defineESModuleGetters(lazy, {
+ PageDataService: "resource:///modules/pagedata/PageDataService.sys.mjs",
+ PromiseUtils: "resource://gre/modules/PromiseUtils.sys.mjs",
+});
+
+/**
+ * Parent side of the PageData actor pair. Forwards "document ready"
+ * notifications from PageDataChild to the PageData service and lets the
+ * service request a collection from the child.
+ */
+export class PageDataParent extends JSWindowActorParent {
+  /**
+   * The deferred for the collection request, created on first use and then
+   * reused for every later call.
+   */
+  #deferredCollection = null;
+
+  /**
+   * Asks the child process to collect page data. Only the first call sends a
+   * query; later calls get the same promise back. The promise resolves to the
+   * page data, or to null if the page is closed before collection completes.
+   *
+   * @returns {Promise<PageData|null>}
+   */
+  collectPageData() {
+    if (this.#deferredCollection) {
+      return this.#deferredCollection.promise;
+    }
+
+    let deferred = lazy.PromiseUtils.defer();
+    this.#deferredCollection = deferred;
+    this.sendQuery("PageData:Collect").then(deferred.resolve, deferred.reject);
+
+    return deferred.promise;
+  }
+
+  /**
+   * Called when the page is destroyed. Settles any outstanding collection
+   * with null.
+   */
+  didDestroy() {
+    if (this.#deferredCollection) {
+      this.#deferredCollection.resolve(null);
+    }
+  }
+
+  /**
+   * Called when a message is received from the content process.
+   *
+   * @param {ReceiveMessageArgument} msg
+   *   The received message.
+   */
+  receiveMessage(msg) {
+    if (msg.name === "PageData:DocumentReady") {
+      lazy.PageDataService.pageLoaded(this, msg.data.url);
+    }
+  }
+}
diff --git a/browser/components/pagedata/PageDataSchema.sys.mjs b/browser/components/pagedata/PageDataSchema.sys.mjs
new file mode 100644
index 0000000000..307b906fdd
--- /dev/null
+++ b/browser/components/pagedata/PageDataSchema.sys.mjs
@@ -0,0 +1,251 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
+
+const lazy = {};
+
+ChromeUtils.defineESModuleGetters(lazy, {
+ JsonSchemaValidator:
+ "resource://gre/modules/components-utils/JsonSchemaValidator.sys.mjs",
+ OpenGraphPageData: "resource:///modules/pagedata/OpenGraphPageData.sys.mjs",
+ SchemaOrgPageData: "resource:///modules/pagedata/SchemaOrgPageData.sys.mjs",
+ TwitterPageData: "resource:///modules/pagedata/TwitterPageData.sys.mjs",
+});
+
+XPCOMUtils.defineLazyGetter(lazy, "logConsole", function () {
+ return console.createInstance({
+ prefix: "PageData",
+ maxLogLevel: Services.prefs.getBoolPref("browser.pagedata.log", false)
+ ? "Debug"
+ : "Warn",
+ });
+});
+
+/**
+ * The list of page data collectors. These should be sorted in order of
+ * specificity, if the same piece of data is provided by two collectors then the
+ * earlier wins.
+ *
+ * Collectors must provide a `collect` function which will be passed the
+ * document object and should return the PageData structure. The function may be
+ * asynchronous if needed.
+ *
+ * The data returned need not be valid, collectors should return whatever they
+ * can and then we drop anything that is invalid once all data is joined.
+ */
+XPCOMUtils.defineLazyGetter(lazy, "DATA_COLLECTORS", function () {
+ return [lazy.SchemaOrgPageData, lazy.OpenGraphPageData, lazy.TwitterPageData];
+});
+
+let SCHEMAS = new Map();
+
+/**
+ * Loads the schema for the given name. Loaded schemas are remembered in
+ * `SCHEMAS` under the name exactly as passed in, so repeated requests for the
+ * same name only fetch once.
+ *
+ * @param {string} schemaName
+ *   The name of the schema to load.
+ * @returns {Promise<object>}
+ *   The parsed JSON schema.
+ * @throws {Error}
+ *   If the schema file could not be fetched.
+ */
+async function loadSchema(schemaName) {
+  if (SCHEMAS.has(schemaName)) {
+    return SCHEMAS.get(schemaName);
+  }
+
+  // Use the locale-independent lower-casing: the result is part of a file
+  // name, and locale-aware mappings (e.g. Turkish "I" -> "ı") would break it.
+  let url = `chrome://browser/content/pagedata/schemas/${schemaName.toLowerCase()}.schema.json`;
+  let response = await fetch(url);
+  if (!response.ok) {
+    throw new Error(`Failed to load schema: ${response.statusText}`);
+  }
+
+  let schema = await response.json();
+  SCHEMAS.set(schemaName, schema);
+  return schema;
+}
+
+/**
+ * Validates the data using the schema with the given name.
+ *
+ * Explicitly undefined properties and properties the schema does not know
+ * about are both accepted.
+ *
+ * @param {string} schemaName
+ *   The name of the schema to validate against. Matched case-insensitively.
+ * @param {object} data
+ *   The data to validate.
+ * @throws
+ *   The validator's error if the data does not match the schema, or an Error
+ *   if the schema could not be loaded.
+ */
+async function validateData(schemaName, data) {
+  // Locale-independent lower-casing, the name ends up in a file name.
+  let schema = await loadSchema(schemaName.toLowerCase());
+
+  let result = lazy.JsonSchemaValidator.validate(data, schema, {
+    allowExplicitUndefinedProperties: true,
+    // Allowed for future expansion of the schema.
+    allowExtraProperties: true,
+  });
+
+  if (!result.valid) {
+    throw result.error;
+  }
+}
+
+/**
+ * A shared API that can be used in parent or child processes
+ */
+export const PageDataSchema = {
+  // Enumeration of data types. The keys must match the schema name.
+  DATA_TYPE: Object.freeze({
+    // Note that 1 and 2 were used as types in earlier versions and should not be used here.
+    PRODUCT: 3,
+    DOCUMENT: 4,
+    ARTICLE: 5,
+    AUDIO: 6,
+    VIDEO: 7,
+  }),
+
+  /**
+   * Gets the data type name.
+   *
+   * @param {DATA_TYPE} type
+   *   The data type from the DATA_TYPE enumeration. A numeric string is
+   *   accepted as well, which is what object keys of a `data` map are.
+   *
+   * @returns {string | null} The name for the type or null if not found.
+   */
+  nameForType(type) {
+    for (let [name, value] of Object.entries(this.DATA_TYPE)) {
+      // Loose equality on purpose so that "3" matches 3.
+      if (value == type) {
+        return name;
+      }
+    }
+
+    return null;
+  },
+
+  /**
+   * Asynchronously validates some page data against the expected schema. Throws
+   * an exception if validation fails.
+   *
+   * @param {DATA_TYPE} type
+   *   The data type from the DATA_TYPE enumeration
+   * @param {object} data
+   *   The page data
+   */
+  async validateData(type, data) {
+    let name = this.nameForType(type);
+
+    if (!name) {
+      throw new Error(`Unknown data type ${type}`);
+    }
+
+    return validateData(name, data);
+  },
+
+  /**
+   * Asynchronously validates an entire PageData structure. Any invalid or
+   * unknown data types are dropped. The general (non-`data`) part must be
+   * valid, otherwise this throws.
+   *
+   * @param {PageData} pageData
+   *   The page data
+   *
+   * @returns {PageData} The validated page data structure
+   */
+  async validatePageData(pageData) {
+    let { data: dataMap = {}, ...general } = pageData;
+
+    await validateData("general", general);
+
+    let validData = {};
+
+    for (let [type, data] of Object.entries(dataMap)) {
+      let name = this.nameForType(type);
+      // Ignore unknown types here.
+      if (!name) {
+        continue;
+      }
+
+      try {
+        await validateData(name, data);
+
+        validData[type] = data;
+      } catch (e) {
+        // Invalid data is dropped, but leave a trace for anyone debugging.
+        lazy.logConsole.debug(`Dropping invalid ${name} data`, e);
+      }
+    }
+
+    return {
+      ...general,
+      data: validData,
+    };
+  },
+
+  /**
+   * Adds new page data into an existing data set. Any existing data is not
+   * overwritten.
+   *
+   * @param {PageData} existingPageData
+   *   The existing page data
+   * @param {PageData} newPageData
+   *   The new page data
+   *
+   * @returns {PageData} The joined data.
+   */
+  coalescePageData(existingPageData, newPageData) {
+    // Split out the general data from the map of specific data.
+    let { data: existingMap = {}, ...existingGeneral } = existingPageData;
+    let { data: newMap = {}, ...newGeneral } = newPageData;
+
+    // Existing general properties take precedence over new ones.
+    Object.assign(newGeneral, existingGeneral);
+
+    let dataMap = {};
+    for (let [type, data] of Object.entries(existingMap)) {
+      if (type in newMap) {
+        // Both sides know this type: merge per property, existing wins.
+        dataMap[type] = Object.assign({}, newMap[type], data);
+      } else {
+        dataMap[type] = data;
+      }
+    }
+
+    for (let [type, data] of Object.entries(newMap)) {
+      if (!(type in dataMap)) {
+        dataMap[type] = data;
+      }
+    }
+
+    return {
+      ...newGeneral,
+      data: dataMap,
+    };
+  },
+
+  /**
+   * Collects page data from a DOM document.
+   *
+   * Every collector is run; a collector that throws is logged and skipped so
+   * that the remaining collectors still contribute.
+   *
+   * @param {Document} document
+   *   The DOM document to collect data from
+   *
+   * @returns {Promise<PageData | null>} The data collected or null in case of
+   *   error.
+   */
+  async collectPageData(document) {
+    lazy.logConsole.debug("Starting collection", document.documentURI);
+
+    let pending = lazy.DATA_COLLECTORS.map(async collector => {
+      try {
+        return await collector.collect(document);
+      } catch (e) {
+        lazy.logConsole.error("Error collecting page data", e);
+        return null;
+      }
+    });
+
+    // Failed collectors produce null, which cannot be coalesced. Drop them.
+    let pageDataList = (await Promise.all(pending)).filter(Boolean);
+
+    let pageData = pageDataList.reduce(PageDataSchema.coalescePageData, {
+      date: Date.now(),
+      url: document.documentURI,
+    });
+
+    try {
+      // The await is required for the catch below to see validation failures.
+      return await this.validatePageData(pageData);
+    } catch (e) {
+      lazy.logConsole.error("Failed to collect valid page data", e);
+      return null;
+    }
+  },
+};
diff --git a/browser/components/pagedata/PageDataService.sys.mjs b/browser/components/pagedata/PageDataService.sys.mjs
new file mode 100644
index 0000000000..5a3c67ee0b
--- /dev/null
+++ b/browser/components/pagedata/PageDataService.sys.mjs
@@ -0,0 +1,681 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
+
+import { EventEmitter } from "resource://gre/modules/EventEmitter.sys.mjs";
+
+const lazy = {};
+
+ChromeUtils.defineESModuleGetters(lazy, {
+ E10SUtils: "resource://gre/modules/E10SUtils.sys.mjs",
+ HiddenFrame: "resource://gre/modules/HiddenFrame.sys.mjs",
+ PromiseUtils: "resource://gre/modules/PromiseUtils.sys.mjs",
+});
+
+XPCOMUtils.defineLazyModuleGetters(lazy, {
+ BrowserWindowTracker: "resource:///modules/BrowserWindowTracker.jsm",
+});
+
+XPCOMUtils.defineLazyGetter(lazy, "logConsole", function () {
+ return console.createInstance({
+ prefix: "PageData",
+ maxLogLevel: Services.prefs.getBoolPref("browser.pagedata.log", false)
+ ? "Debug"
+ : "Warn",
+ });
+});
+
+XPCOMUtils.defineLazyServiceGetters(lazy, {
+ idleService: ["@mozilla.org/widget/useridleservice;1", "nsIUserIdleService"],
+});
+
+XPCOMUtils.defineLazyPreferenceGetter(
+ lazy,
+ "fetchIdleTime",
+ "browser.pagedata.fetchIdleTime",
+ 300
+);
+
+const ALLOWED_SCHEMES = ["http", "https", "data", "blob"];
+
+const BACKGROUND_WIDTH = 1024;
+const BACKGROUND_HEIGHT = 768;
+
+/**
+ * Removes and returns the element that the set would iterate first.
+ *
+ * @param {Set<T>} set
+ *   The set to take an element from. It is modified in place.
+ * @returns {T | undefined} The removed element, or undefined when the set is
+ *   empty.
+ */
+function shift(set) {
+  // Only the first iteration ever runs: it removes the element and returns.
+  for (let first of set) {
+    set.delete(first);
+    return first;
+  }
+
+  return undefined;
+}
+
+/**
+ * A manager for hidden browsers. Responsible for creating and destroying a
+ * hidden frame to hold them.
+ */
+class HiddenBrowserManager {
+  /**
+   * The hidden frame if one has been created.
+   *
+   * @type {HiddenFrame | null}
+   */
+  #frame = null;
+  /**
+   * The number of hidden browser elements currently in use.
+   *
+   * @type {number}
+   */
+  #browsers = 0;
+
+  /**
+   * Creates and returns a new hidden browser. If the browser cannot be
+   * created the usage count is rolled back before the error is rethrown.
+   *
+   * @returns {Browser}
+   */
+  async #acquireBrowser() {
+    this.#browsers++;
+
+    try {
+      if (!this.#frame) {
+        this.#frame = new lazy.HiddenFrame();
+      }
+
+      let frame = await this.#frame.get();
+      let doc = frame.document;
+      let browser = doc.createXULElement("browser");
+      browser.setAttribute("remote", "true");
+      browser.setAttribute("type", "content");
+      browser.setAttribute(
+        "style",
+        `
+ width: ${BACKGROUND_WIDTH}px;
+ min-width: ${BACKGROUND_WIDTH}px;
+ height: ${BACKGROUND_HEIGHT}px;
+ min-height: ${BACKGROUND_HEIGHT}px;
+ `
+      );
+      browser.setAttribute("maychangeremoteness", "true");
+      doc.documentElement.appendChild(browser);
+
+      return browser;
+    } catch (e) {
+      // Nothing was handed out, so give the slot back. Without this the count
+      // would never return to zero and the frame would never be destroyed.
+      this.#releaseBrowser(null);
+      throw e;
+    }
+  }
+
+  /**
+   * Releases the given hidden browser. The hidden frame is destroyed once no
+   * browsers remain in use.
+   *
+   * @param {Browser | null} browser
+   *   The hidden browser element, or null when only the usage count needs to
+   *   be released.
+   */
+  #releaseBrowser(browser) {
+    browser?.remove();
+
+    this.#browsers--;
+    if (this.#browsers == 0) {
+      this.#frame?.destroy();
+      this.#frame = null;
+    }
+  }
+
+  /**
+   * Calls a callback function with a new hidden browser.
+   * This function will return whatever the callback function returns.
+   *
+   * @param {Callback} callback
+   *   The callback function will be called with the browser element and may
+   *   be asynchronous.
+   * @returns {T}
+   */
+  async withHiddenBrowser(callback) {
+    let browser = await this.#acquireBrowser();
+    try {
+      return await callback(browser);
+    } finally {
+      this.#releaseBrowser(browser);
+    }
+  }
+}
+
+/**
+ * @typedef {object} CacheEntry
+ * An entry in the page data cache.
+ * @property {PageData | null} pageData
+ * The data or null if there is no known data.
+ * @property {Set} actors
+ * The actors that maintain an interest in keeping the entry cached.
+ */
+
+/**
+ * An in-memory cache of page data keyed on page url. An entry only exists
+ * while at least one actor holds a lock on its url; the entry is removed when
+ * the last lock is released.
+ */
+class PageDataCache {
+  /**
+   * The contents of the cache. Keyed on page url.
+   *
+   * @type {Map<string, CacheEntry>}
+   */
+  #cache = new Map();
+
+  /**
+   * Updates the data of an existing entry. Does nothing when no entry exists
+   * for the url, i.e. when no actor has locked it.
+   *
+   * @param {string} url
+   *   The url of the page.
+   * @param {PageData|null} pageData
+   *   The current page data for the page.
+   */
+  set(url, pageData) {
+    let entry = this.#cache.get(url);
+    if (!entry) {
+      return;
+    }
+
+    entry.pageData = pageData;
+  }
+
+  /**
+   * Gets any cached data for the url.
+   *
+   * @param {string} url
+   *   The url of the page.
+   * @returns {PageData | null}
+   *   The page data, or null when there is no entry or no data has been
+   *   stored in it yet.
+   */
+  get(url) {
+    return this.#cache.get(url)?.pageData ?? null;
+  }
+
+  /**
+   * Adds a lock to an entry, creating an entry without data if none exists
+   * yet. This can be called before we have discovered the data for the url.
+   *
+   * @param {object} actor
+   *   Ensures the entry stays in memory until unlocked by this actor.
+   * @param {string} url
+   *   The url of the page.
+   */
+  lockData(actor, url) {
+    let entry = this.#cache.get(url);
+    if (!entry) {
+      entry = { pageData: undefined, actors: new Set() };
+      this.#cache.set(url, entry);
+    }
+
+    entry.actors.add(actor);
+  }
+
+  /**
+   * Removes a lock from an entry. Entries left without any lock are removed
+   * from the cache.
+   *
+   * @param {object} actor
+   *   The lock to remove.
+   * @param {string | undefined} [url]
+   *   The url of the page or undefined to unlock all urls locked by this actor.
+   */
+  unlockData(actor, url) {
+    let candidates;
+    if (url) {
+      let entry = this.#cache.get(url);
+      if (!entry) {
+        return;
+      }
+
+      candidates = [[url, entry]];
+    } else {
+      // Snapshot the map so entries can be deleted while looping.
+      candidates = Array.from(this.#cache);
+    }
+
+    for (let [entryUrl, { actors }] of candidates) {
+      if (actors.delete(actor) && actors.size == 0) {
+        this.#cache.delete(entryUrl);
+      }
+    }
+  }
+}
+
+/**
+ * @typedef {object} PageData
+ * A set of data discovered from a page. Other than the `data` property this is the
+ * schema at `browser/components/pagedata/schemas/general.schema.json`.
+ * @property {string} url
+ * The page's url.
+ * @property {number} date
+ * The epoch based timestamp for when the data was discovered.
+ * @property {string} siteName
+ * The page's friendly site name.
+ * @property {string} image
+ * The page's image.
+ * @property {object} data
+ * The map of data found, which may be empty if no data was found. Each key in
+ * the map comes from the `PageDataSchema.DATA_TYPE` enumeration. The values are
+ * in the format defined by the schemas at `browser/components/pagedata/schemas`.
+ */
+
+export const PageDataService = new (class PageDataService extends EventEmitter {
+  /**
+   * Caches page data discovered from browsers.
+   *
+   * @type {PageDataCache}
+   */
+  #pageDataCache = new PageDataCache();
+
+  /**
+   * The number of currently running background fetches.
+   *
+   * @type {number}
+   */
+  #backgroundFetches = 0;
+
+  /**
+   * The list of urls waiting to be loaded in the background.
+   *
+   * @type {Set<string>}
+   */
+  #backgroundQueue = new Set();
+
+  /**
+   * Tracks whether the user is currently idle.
+   *
+   * @type {boolean}
+   */
+  #userIsIdle = false;
+
+  /**
+   * A manager for hidden browsers.
+   *
+   * @type {HiddenBrowserManager}
+   */
+  #browserManager = new HiddenBrowserManager();
+
+  /**
+   * A map of hidden browsers to a resolve function that should be passed the
+   * actor that was created for the browser.
+   *
+   * @type {WeakMap<Browser, function(PageDataParent): void>}
+   */
+  #backgroundBrowsers = new WeakMap();
+
+  /**
+   * Tracks windows that have browsers with entries in the cache.
+   *
+   * @type {Map<Window, Set<Browser>>}
+   */
+  #trackedWindows = new Map();
+
+  /**
+   * Constructs the service.
+   */
+  constructor() {
+    super();
+
+    // Limits the number of background fetches that will run at once. Set to 0 to
+    // effectively allow an infinite number.
+    XPCOMUtils.defineLazyPreferenceGetter(
+      this,
+      "MAX_BACKGROUND_FETCHES",
+      "browser.pagedata.maxBackgroundFetches",
+      5,
+      () => this.#startBackgroundWorkers()
+    );
+  }
+
+  /**
+   * Initializes a new instance of the service, not called externally. Does
+   * nothing unless the `browser.pagedata.enabled` preference is set.
+   */
+  init() {
+    if (!Services.prefs.getBoolPref("browser.pagedata.enabled", false)) {
+      return;
+    }
+
+    ChromeUtils.registerWindowActor("PageData", {
+      parent: {
+        esModuleURI: "resource:///actors/PageDataParent.sys.mjs",
+      },
+      child: {
+        esModuleURI: "resource:///actors/PageDataChild.sys.mjs",
+        events: {
+          DOMContentLoaded: {},
+          pageshow: {},
+        },
+      },
+    });
+
+    lazy.logConsole.debug("Service started");
+
+    for (let win of lazy.BrowserWindowTracker.orderedWindows) {
+      if (!win.closed) {
+        // Ask any existing tabs to report
+        for (let tab of win.gBrowser.tabs) {
+          // A tab may have no browsing context or window global. Skip those
+          // tabs instead of throwing, which would abort the rest of init.
+          let parent =
+            tab.linkedBrowser.browsingContext?.currentWindowGlobal?.getActor(
+              "PageData"
+            );
+
+          parent?.sendAsyncMessage("PageData:CheckLoaded");
+        }
+      }
+    }
+
+    lazy.idleService.addIdleObserver(this, lazy.fetchIdleTime);
+  }
+
+  /**
+   * Called when the service is destroyed. This is generally on shutdown so we
+   * don't really need to do much cleanup.
+   */
+  uninit() {
+    lazy.logConsole.debug("Service stopped");
+  }
+
+  /**
+   * Starts tracking for when a browser is destroyed. The first browser seen
+   * for a window installs `unload` and `TabClose` listeners on that window;
+   * later browsers of the same window are only added to the tracked set.
+   *
+   * @param {Browser} browser
+   *   The browser to track.
+   */
+  #trackBrowser(browser) {
+    let window = browser.ownerGlobal;
+
+    let browsers = this.#trackedWindows.get(window);
+    if (browsers) {
+      browsers.add(browser);
+
+      // This window is already being tracked, no need to add listeners.
+      return;
+    }
+
+    browsers = new Set([browser]);
+    this.#trackedWindows.set(window, browsers);
+
+    window.addEventListener("unload", () => {
+      for (let closedBrowser of browsers) {
+        this.unlockEntry(closedBrowser);
+      }
+
+      this.#trackedWindows.delete(window);
+    });
+
+    window.addEventListener("TabClose", ({ target: tab }) => {
+      // Unlock any entries locked by this browser.
+      let closedBrowser = tab.linkedBrowser;
+      this.unlockEntry(closedBrowser);
+      browsers.delete(closedBrowser);
+    });
+  }
+
+  /**
+   * Requests that any page data for this url is retained in memory until
+   * unlocked. By calling this you are committing to later call `unlockEntry`
+   * with the same `actor` and `url` parameters.
+   *
+   * @param {object} actor
+   *   The actor requesting the lock.
+   * @param {string} url
+   *   The url of the page to lock.
+   */
+  lockEntry(actor, url) {
+    this.#pageDataCache.lockData(actor, url);
+  }
+
+  /**
+   * Notifies that an actor is no longer interested in a url.
+   *
+   * @param {object} actor
+   *   The actor that requested the lock.
+   * @param {string | undefined} [url]
+   *   The url of the page or undefined to unlock all urls locked by this actor.
+   */
+  unlockEntry(actor, url) {
+    this.#pageDataCache.unlockData(actor, url);
+  }
+
+  /**
+   * Called when the content process signals that a page is ready for data
+   * collection.
+   *
+   * @param {PageDataParent} actor
+   *   The parent actor for the page.
+   * @param {string} url
+   *   The url of the page.
+   */
+  async pageLoaded(actor, url) {
+    let uri = Services.io.newURI(url);
+    if (!ALLOWED_SCHEMES.includes(uri.scheme)) {
+      return;
+    }
+
+    let browser = actor.browsingContext?.embedderElement;
+
+    // If we don't have a browser then it went away before we could record,
+    // so we don't know where the data came from.
+    if (!browser) {
+      return;
+    }
+
+    // Is this a load in a background browser?
+    let backgroundResolve = this.#backgroundBrowsers.get(browser);
+    if (backgroundResolve) {
+      backgroundResolve(actor);
+      return;
+    }
+
+    // Otherwise we only care about pages loaded in the tab browser.
+    if (!this.#isATabBrowser(browser)) {
+      return;
+    }
+
+    try {
+      let data = await actor.collectPageData();
+      if (data) {
+        // Keep this data alive until the browser is destroyed.
+        this.#trackBrowser(browser);
+        this.lockEntry(browser, data.url);
+
+        this.pageDataDiscovered(data);
+      }
+    } catch (e) {
+      lazy.logConsole.error(e);
+    }
+  }
+
+  /**
+   * Adds data for a url. This should generally only be called by other components of the
+   * page data service or tests for simulating page data collection.
+   *
+   * @param {PageData} pageData
+   *   The set of data discovered.
+   */
+  pageDataDiscovered(pageData) {
+    lazy.logConsole.debug("Discovered page data", pageData);
+
+    // The cached copy always carries a `data` map, even if none was supplied.
+    this.#pageDataCache.set(pageData.url, {
+      ...pageData,
+      data: pageData.data ?? {},
+    });
+
+    // Send out a notification.
+    this.emit("page-data", pageData);
+  }
+
+  /**
+   * Retrieves any cached page data. Returns null if there is no information in the cache, this will
+   * happen either if the page has not been browsed recently or if data collection failed for some
+   * reason.
+   *
+   * @param {string} url
+   *   The url to retrieve data for.
+   * @returns {PageData|null}
+   *   A `PageData` if one is cached (it may not actually contain any items of data) or null if this
+   *   page has not been successfully checked for data recently.
+   */
+  getCached(url) {
+    return this.#pageDataCache.get(url);
+  }
+
+  /**
+   * Fetches page data from the given URL using a hidden window. Note that this does not populate
+   * the page data cache or emit the `page-data` event.
+   *
+   * @param {string} url
+   *   The url to retrieve data for.
+   * @returns {Promise<PageData|null>}
+   *   Resolves to the found pagedata or null in case of error.
+   */
+  async fetchPageData(url) {
+    return this.#browserManager.withHiddenBrowser(async browser => {
+      try {
+        // `pageLoaded` resolves this promise with the actor once the hidden
+        // browser reports that its document is ready.
+        let { promise, resolve } = lazy.PromiseUtils.defer();
+        this.#backgroundBrowsers.set(browser, resolve);
+
+        let principal = Services.scriptSecurityManager.getSystemPrincipal();
+        let oa = lazy.E10SUtils.predictOriginAttributes({
+          browser,
+        });
+        let loadURIOptions = {
+          triggeringPrincipal: principal,
+          remoteType: lazy.E10SUtils.getRemoteTypeForURI(
+            url,
+            true,
+            false,
+            lazy.E10SUtils.DEFAULT_REMOTE_TYPE,
+            null,
+            oa
+          ),
+        };
+        browser.fixupAndLoadURIString(url, loadURIOptions);
+
+        let actor = await promise;
+        return await actor.collectPageData();
+      } finally {
+        this.#backgroundBrowsers.delete(browser);
+      }
+    });
+  }
+
+  /**
+   * Handles notifications from the idle service.
+   *
+   * @param {nsISupports} subject
+   *   The notification's subject.
+   * @param {string} topic
+   *   The notification topic.
+   * @param {string} data
+   *   The data associated with the notification.
+   */
+  observe(subject, topic, data) {
+    switch (topic) {
+      case "idle":
+        lazy.logConsole.debug("User went idle");
+        this.#userIsIdle = true;
+        this.#startBackgroundWorkers();
+        break;
+      case "active":
+        lazy.logConsole.debug("User became active");
+        this.#userIsIdle = false;
+        break;
+    }
+  }
+
+  /**
+   * Starts as many background workers as are allowed to process the background
+   * queue. Does nothing while the user is active.
+   */
+  #startBackgroundWorkers() {
+    if (!this.#userIsIdle) {
+      return;
+    }
+
+    let toStart;
+
+    if (this.MAX_BACKGROUND_FETCHES) {
+      toStart = this.MAX_BACKGROUND_FETCHES - this.#backgroundFetches;
+    } else {
+      // No limit configured: one worker per queued url.
+      toStart = this.#backgroundQueue.size;
+    }
+
+    for (let i = 0; i < toStart; i++) {
+      this.#backgroundFetch();
+    }
+  }
+
+  /**
+   * Starts a background fetch worker which will pull urls from the queue and
+   * load them until the queue is empty.
+   */
+  async #backgroundFetch() {
+    this.#backgroundFetches++;
+
+    let url = shift(this.#backgroundQueue);
+    while (url) {
+      try {
+        let pageData = await this.fetchPageData(url);
+
+        if (pageData) {
+          this.#pageDataCache.set(url, pageData);
+          this.emit("page-data", pageData);
+        }
+      } catch (e) {
+        lazy.logConsole.error(e);
+      }
+
+      // Check whether the user became active or the worker limit changed
+      // dynamically.
+      if (
+        !this.#userIsIdle ||
+        (this.MAX_BACKGROUND_FETCHES > 0 &&
+          this.#backgroundFetches > this.MAX_BACKGROUND_FETCHES)
+      ) {
+        break;
+      }
+
+      url = shift(this.#backgroundQueue);
+    }
+
+    this.#backgroundFetches--;
+  }
+
+  /**
+   * Queues page data retrieval for a url. The page-data notification will be
+   * generated if data becomes available.
+   *
+   * Check `getCached` first to ensure that data is not already in the cache.
+   *
+   * @param {string} url
+   *   The url to retrieve data for.
+   */
+  queueFetch(url) {
+    this.#backgroundQueue.add(url);
+
+    this.#startBackgroundWorkers();
+  }
+
+  /**
+   * Determines if the given browser is contained within a tab.
+   *
+   * @param {DOMElement} browser
+   *   The browser element to check.
+   * @returns {boolean}
+   *   True if the browser element is contained within a tab.
+   */
+  #isATabBrowser(browser) {
+    // Coerce the tab element (or undefined) to the documented boolean.
+    return !!browser.ownerGlobal.gBrowser?.getTabForBrowser(browser);
+  }
+})();
diff --git a/browser/components/pagedata/SchemaOrgPageData.sys.mjs b/browser/components/pagedata/SchemaOrgPageData.sys.mjs
new file mode 100644
index 0000000000..449572c76f
--- /dev/null
+++ b/browser/components/pagedata/SchemaOrgPageData.sys.mjs
@@ -0,0 +1,441 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+import { PageDataSchema } from "resource:///modules/pagedata/PageDataSchema.sys.mjs";
+
+/**
+ * Represents an item from the schema.org specification.
+ *
+ * Every `Item` has a type and a set of properties. Each property has a string
+ * name and a list of values. It often isn't clear from the spec whether a
+ * property is expected to have a list of values or just one value so this
+ * data structure stores every property as a list and provides a simple method
+ * to get the first property value.
+ */
+class Item {
+ /** @type {string} The type of the item e.g. "Product" or "Person". */
+ type;
+
+ /** @type {Map<string, any[]>} Properties of the item. */
+ properties = new Map();
+
+ /**
+ * Constructors a new `Item` of the given type.
+ *
+ * @param {string} type
+ * The type of the item.
+ */
+ constructor(type) {
+ this.type = type;
+ }
+
+ /**
+ * Tests whether a property has any values in this item.
+ *
+ * @param {string} prop
+ * The name of the property.
+ * @returns {boolean}
+ */
+ has(prop) {
+ return this.properties.has(prop);
+ }
+
+ /**
+ * Gets all of the values for a property. This may return an empty array if
+ * there are no values.
+ *
+ * @param {string} prop
+ * The name of the property.
+ * @returns {any[]}
+ */
+ all(prop) {
+ return this.properties.get(prop) ?? [];
+ }
+
+ /**
+ * Gets the first value for a property.
+ *
+ * @param {string} prop
+ * The name of the property.
+ * @returns {any}
+ */
+ get(prop) {
+ return this.properties.get(prop)?.[0];
+ }
+
+ /**
+ * Sets a value for a property.
+ *
+ * @param {string} prop
+ * The name of the property.
+ * @param {any} value
+ * The value of the property.
+ */
+ set(prop, value) {
+ let props = this.properties.get(prop);
+ if (props === undefined) {
+ props = [];
+ this.properties.set(prop, props);
+ }
+
+ props.push(value);
+ }
+
+ /**
+ * Converts this item to JSON-LD.
+ *
+ * Single array properties are converted into simple properties.
+ *
+ * @returns {object}
+ */
+ toJsonLD() {
+ /**
+ * Converts a value to its JSON-LD representation.
+ *
+ * @param {any} val
+ * The value to convert.
+ * @returns {any}
+ */
+ function toLD(val) {
+ if (val instanceof Item) {
+ return val.toJsonLD();
+ }
+ return val;
+ }
+
+ let props = Array.from(this.properties, ([key, value]) => {
+ if (value.length == 1) {
+ return [key, toLD(value[0])];
+ }
+
+ return [key, value.map(toLD)];
+ });
+
+ return {
+ "@type": this.type,
+ ...Object.fromEntries(props),
+ };
+ }
+}
+
+/**
+ * Parses the value for a given microdata property.
+ * See https://html.spec.whatwg.org/multipage/microdata.html#values for the parsing spec
+ *
+ * @param {Element} propElement
+ * The property element.
+ * @returns {any}
+ * The value of the property.
+ */
+function parseMicrodataProp(propElement) {
+ if (propElement.hasAttribute("itemscope")) {
+ throw new Error(
+ "Cannot parse a simple property value from an itemscope element."
+ );
+ }
+
+ const parseUrl = (urlElement, attr) => {
+ if (!urlElement.hasAttribute(attr)) {
+ return "";
+ }
+
+ try {
+ let url = new URL(
+ urlElement.getAttribute(attr),
+ urlElement.ownerDocument.documentURI
+ );
+ return url.toString();
+ } catch (e) {
+ return "";
+ }
+ };
+
+ switch (propElement.localName) {
+ case "meta":
+ return propElement.getAttribute("content") ?? "";
+ case "audio":
+ case "embed":
+ case "iframe":
+ case "source":
+ case "track":
+ case "video":
+ return parseUrl(propElement, "src");
+ case "img":
+ // Some pages may be using a lazy loading approach to images, putting a
+ // temporary image in "src" while the real image is in a differently
+ // named attribute. So far we found "content" and "data-src" are common
+ // names for that attribute.
+ return (
+ parseUrl(propElement, "content") ||
+ parseUrl(propElement, "data-src") ||
+ parseUrl(propElement, "src")
+ );
+ case "object":
+ return parseUrl(propElement, "data");
+ case "a":
+ case "area":
+ case "link":
+ return parseUrl(propElement, "href");
+ case "data":
+ case "meter":
+ return propElement.getAttribute("value");
+ case "time":
+ if (propElement.hasAtribute("datetime")) {
+ return propElement.getAttribute("datetime");
+ }
+ return propElement.textContent;
+ default:
+ // Not mentioned in the spec but sites seem to use it.
+ if (propElement.hasAttribute("content")) {
+ return propElement.getAttribute("content");
+ }
+ return propElement.textContent;
+ }
+}
+
+/**
+ * Collects product data from an item.
+ *
+ * @param {Document} document
+ * The document the item comes from.
+ * @param {PageData} pageData
+ * The pageData object to add to.
+ * @param {Item} item
+ * The product item.
+ */
+function collectProduct(document, pageData, item) {
+ if (item.has("image")) {
+ let url = new URL(item.get("image"), document.documentURI);
+ pageData.image = url.toString();
+ }
+
+ if (item.has("description")) {
+ pageData.description = item.get("description");
+ }
+
+ pageData.data[PageDataSchema.DATA_TYPE.PRODUCT] = {
+ name: item.get("name"),
+ };
+
+ for (let offer of item.all("offers")) {
+ if (!(offer instanceof Item) || offer.type != "Offer") {
+ continue;
+ }
+
+ let price = parseFloat(offer.get("price"));
+ if (!isNaN(price)) {
+ pageData.data[PageDataSchema.DATA_TYPE.PRODUCT].price = {
+ value: price,
+ currency: offer.get("priceCurrency"),
+ };
+
+ break;
+ }
+ }
+}
+
+/**
+ * Returns the root microdata items from the given document.
+ *
+ * @param {Document} document
+ * The DOM document to collect from.
+ * @returns {Item[]}
+ */
+function collectMicrodataItems(document) {
+ // First find all of the items in the document.
+ let itemElements = document.querySelectorAll(
+ "[itemscope][itemtype^='https://schema.org/'], [itemscope][itemtype^='http://schema.org/']"
+ );
+
+ /**
+ * Maps elements to the closest item.
+ *
+ * @type {Map<Element, Item>}
+ */
+ let items = new Map();
+
+ /**
+ * Finds the item for an element. Throws if there is no item. Caches the
+ * result.
+ *
+ * @param {Element} element
+ * The element to search from.
+ * @returns {Item}
+ */
+ function itemFor(element) {
+ let item = items.get(element);
+ if (item) {
+ return item;
+ }
+
+ if (!element.parentElement) {
+ throw new Error("Element has no parent item.");
+ }
+
+ item = itemFor(element.parentElement);
+ items.set(element, item);
+ return item;
+ }
+
+ for (let element of itemElements) {
+ let itemType = element.getAttribute("itemtype");
+ // Strip off the base url
+ if (itemType.startsWith("https://")) {
+ itemType = itemType.substring(19);
+ } else {
+ itemType = itemType.substring(18);
+ }
+
+ items.set(element, new Item(itemType));
+ }
+
+ // The initial roots are just all the items.
+ let roots = new Set(items.values());
+
+ // Now find all item properties.
+ let itemProps = document.querySelectorAll(
+ "[itemscope][itemtype^='https://schema.org/'] [itemprop], [itemscope][itemtype^='http://schema.org/'] [itemprop]"
+ );
+
+ for (let element of itemProps) {
+ // The item is always defined above the current element.
+ let item = itemFor(element.parentElement);
+
+ // The properties value is either a nested item or a simple value.
+ let propValue = items.get(element) ?? parseMicrodataProp(element);
+ item.set(element.getAttribute("itemprop"), propValue);
+
+ if (propValue instanceof Item) {
+ // This item belongs to another item and so is not a root item.
+ roots.delete(propValue);
+ }
+ }
+
+ return [...roots];
+}
+
+/**
+ * Returns the root JSON-LD items from the given document.
+ *
+ * @param {Document} document
+ * The DOM document to collect from.
+ * @returns {Item[]}
+ */
+function collectJsonLDItems(document) {
+ /**
+ * The root items.
+ *
+ * @type {Item[]}
+ */
+ let items = [];
+
+ /**
+ * Converts a JSON-LD value into an Item if appropriate.
+ *
+ * @param {any} val
+ * The value to convert.
+ * @returns {any}
+ */
+ function fromLD(val) {
+ if (typeof val == "object" && "@type" in val) {
+ let item = new Item(val["@type"]);
+
+ for (let [prop, value] of Object.entries(val)) {
+ // Ignore meta properties.
+ if (prop.startsWith("@")) {
+ continue;
+ }
+
+ if (!Array.isArray(value)) {
+ value = [value];
+ }
+
+ item.properties.set(prop, value.map(fromLD));
+ }
+
+ return item;
+ }
+
+ return val;
+ }
+
+ let scripts = document.querySelectorAll("script[type='application/ld+json'");
+ for (let script of scripts) {
+ try {
+ let content = JSON.parse(script.textContent);
+
+ if (typeof content != "object") {
+ continue;
+ }
+
+ if (!("@context" in content)) {
+ continue;
+ }
+
+ if (
+ content["@context"] != "http://schema.org" &&
+ content["@context"] != "https://schema.org"
+ ) {
+ continue;
+ }
+
+ let item = fromLD(content);
+ if (item instanceof Item) {
+ items.push(item);
+ }
+ } catch (e) {
+ // Unparsable content.
+ }
+ }
+
+ return items;
+}
+
+/**
+ * Collects schema.org related data from a page.
+ *
+ * Currently only supports HTML Microdata and JSON-LD formats, not RDFa.
+ */
+export const SchemaOrgPageData = {
+ /**
+ * Parses and collects the schema.org items from the given document.
+ * The returned items are the roots, i.e. the top-level items, there may be
+ * other items as nested properties.
+ *
+ * @param {Document} document
+ * The DOM document to parse.
+ * @returns {Item[]}
+ */
+ collectItems(document) {
+ return collectMicrodataItems(document).concat(collectJsonLDItems(document));
+ },
+
+ /**
+ * Performs PageData collection from the given document.
+ *
+ * @param {Document} document
+ * The DOM document to collect from.
+ * @returns {PageData}
+ */
+ collect(document) {
+ let pageData = { data: {} };
+
+ let items = this.collectItems(document);
+
+ for (let item of items) {
+ switch (item.type) {
+ case "Product":
+ if (!(PageDataSchema.DATA_TYPE.PRODUCT in pageData.data)) {
+ collectProduct(document, pageData, item);
+ }
+ break;
+ case "Organization":
+ pageData.siteName = item.get("name");
+ break;
+ }
+ }
+
+ return pageData;
+ },
+};
diff --git a/browser/components/pagedata/TwitterPageData.sys.mjs b/browser/components/pagedata/TwitterPageData.sys.mjs
new file mode 100644
index 0000000000..88b06098cb
--- /dev/null
+++ b/browser/components/pagedata/TwitterPageData.sys.mjs
@@ -0,0 +1,42 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Collects Twitter card (https://developer.twitter.com/en/docs/twitter-for-websites/)
+ * related data from a page.
+ */
+export const TwitterPageData = {
+ /**
+ * Collects the twitter data from the page.
+ *
+ * @param {Document} document
+ * The document to collect from
+ *
+ * @returns {PageData}
+ */
+ collect(document) {
+ let pageData = {};
+
+ let twitterTags = document.querySelectorAll("meta[name^='twitter:'");
+
+ for (let tag of twitterTags) {
+ // Strip "twitter:" from the property name.
+ let propertyName = tag.getAttribute("name").substring(8);
+
+ switch (propertyName) {
+ case "site":
+ pageData.siteName = tag.getAttribute("content");
+ break;
+ case "description":
+ pageData.description = tag.getAttribute("content");
+ break;
+ case "image":
+ pageData.image = tag.getAttribute("content");
+ break;
+ }
+ }
+
+ return pageData;
+ },
+};
diff --git a/browser/components/pagedata/docs/index.md b/browser/components/pagedata/docs/index.md
new file mode 100644
index 0000000000..47b507d13a
--- /dev/null
+++ b/browser/components/pagedata/docs/index.md
@@ -0,0 +1,50 @@
+# PageDataService
+
+The page data service is responsible for collecting additional data about a page. This could include
+information about the media on a page, product information, etc. When enabled it will automatically
+try to find page data for pages that the user browses or it can be directed to asynchronously look
+up the page data for a url.
+
+The `PageDataService` is an EventEmitter and listeners can subscribe to its notifications via the
+`on` and `once` methods.
+
+The service can be enabled by setting `browser.pagedata.enabled` to true. Additional logging can be
+enabled by setting `browser.pagedata.log` to true.
+
+## PageData Data Structure
+
+At a high level the page data service can collect many different kinds of data. When queried the
+service will respond with a `PageData` structure which holds some general information about the
+page, the time when the data was discovered and a map of the different types of data found. This map
+will be empty if no specific data was found. The key of the map is from the
+`PageDataSchema.DATA_TYPE` enumeration. The value is the JSON data which differs in structure
+depending on the data type.
+
+```
+{
+ "url": <url of the page as a string>,
+ "date": <epoch based timestamp>,
+ "siteName": <a friendly name for the website>,
+ "image": <url for an image for the page as a string>,
+ "data": <map of data types>,
+}
+```
+
+## PageData Collection
+
+Page data is gathered in one of two ways.
+
+Page data is automatically gathered for webpages the user visits. This collection is triggered after
+a short delay and then updated when necessary. Any data is cached in memory for a period of time.
+When page data has been found a `page-data` event is emitted. The event's argument holds the
+`PageData` structure. The `getCached` function can be used to access any cached data for a url.
+
+## Supported Types of page data
+
+The following types of page data (`PageDataSchema.DATA_TYPE`) are currently supported:
+
+- `PRODUCT`
+- `DOCUMENT`
+- `ARTICLE`
+- `AUDIO`
+- `VIDEO`
diff --git a/browser/components/pagedata/jar.mn b/browser/components/pagedata/jar.mn
new file mode 100644
index 0000000000..19860a30ee
--- /dev/null
+++ b/browser/components/pagedata/jar.mn
@@ -0,0 +1,6 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+browser.jar:
+ content/browser/pagedata/schemas/ (schemas/*.json)
diff --git a/browser/components/pagedata/moz.build b/browser/components/pagedata/moz.build
new file mode 100644
index 0000000000..0d733ca309
--- /dev/null
+++ b/browser/components/pagedata/moz.build
@@ -0,0 +1,29 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+XPCSHELL_TESTS_MANIFESTS += [
+ "tests/unit/xpcshell.ini",
+]
+BROWSER_CHROME_MANIFESTS += [
+ "tests/browser/browser.ini",
+]
+
+JAR_MANIFESTS += ["jar.mn"]
+
+EXTRA_JS_MODULES.pagedata += [
+ "OpenGraphPageData.sys.mjs",
+ "PageDataSchema.sys.mjs",
+ "PageDataService.sys.mjs",
+ "SchemaOrgPageData.sys.mjs",
+ "TwitterPageData.sys.mjs",
+]
+
+FINAL_TARGET_FILES.actors += [
+ "PageDataChild.sys.mjs",
+ "PageDataParent.sys.mjs",
+]
+
+SPHINX_TREES["docs"] = "docs"
diff --git a/browser/components/pagedata/schemas/article.schema.json b/browser/components/pagedata/schemas/article.schema.json
new file mode 100644
index 0000000000..e02bb11655
--- /dev/null
+++ b/browser/components/pagedata/schemas/article.schema.json
@@ -0,0 +1,26 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "article.schema.json",
+ "title": "Article",
+ "description": "An article for reading",
+ "type": "object",
+ "properties": {
+ "name": {
+ "description": "The article's name",
+ "type": "string"
+ },
+ "author": {
+ "description": "The author(s) of the article",
+ "type": "string"
+ },
+ "date": {
+ "description": "The date the article was published in ISO-8601 date or date/time format",
+ "type": "string"
+ },
+ "readingTime": {
+ "description": "The expected time to read the article in seconds",
+ "type": "number"
+ }
+ },
+ "required": ["name"]
+}
diff --git a/browser/components/pagedata/schemas/audio.schema.json b/browser/components/pagedata/schemas/audio.schema.json
new file mode 100644
index 0000000000..db1b79b55c
--- /dev/null
+++ b/browser/components/pagedata/schemas/audio.schema.json
@@ -0,0 +1,34 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "audio.schema.json",
+ "title": "Audio",
+ "description": "An audio file",
+ "type": "object",
+ "properties": {
+ "name": {
+ "description": "The audio's name",
+ "type": "string"
+ },
+ "duration": {
+ "description": "The audio's duration in seconds",
+ "type": "number"
+ },
+ "artist": {
+ "description": "The artist who created the audio",
+ "type": "string"
+ },
+ "album": {
+ "description": "For music on an album the name of the album",
+ "type": "string"
+ },
+ "track": {
+ "description": "For music on an album the number of the track on the album",
+ "type": "number"
+ },
+ "genre": {
+ "description": "The genre of the audio",
+ "type": "string"
+ }
+ },
+ "required": ["name"]
+}
diff --git a/browser/components/pagedata/schemas/document.schema.json b/browser/components/pagedata/schemas/document.schema.json
new file mode 100644
index 0000000000..849010773b
--- /dev/null
+++ b/browser/components/pagedata/schemas/document.schema.json
@@ -0,0 +1,18 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "document.schema.json",
+ "title": "Document",
+ "description": "A document of some kind, either viewable or editable",
+ "type": "object",
+ "properties": {
+ "name": {
+ "description": "The document's name",
+ "type": "string"
+ },
+ "mimeType": {
+ "description": "The document's mimetype",
+ "type": "string"
+ }
+ },
+ "required": ["name"]
+}
diff --git a/browser/components/pagedata/schemas/general.schema.json b/browser/components/pagedata/schemas/general.schema.json
new file mode 100644
index 0000000000..a400fd889b
--- /dev/null
+++ b/browser/components/pagedata/schemas/general.schema.json
@@ -0,0 +1,30 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "general.schema.json",
+ "title": "General",
+ "description": "General data about a page",
+ "type": "object",
+ "properties": {
+ "url": {
+ "description": "The page's url",
+ "type": "string"
+ },
+ "date": {
+ "description": "The date the data was collected as a timestamp",
+ "type": "number"
+ },
+ "description": {
+ "description": "A description of the page",
+ "type": "string"
+ },
+ "siteName": {
+ "description": "A friendly name for the site",
+ "type": "string"
+ },
+ "image": {
+ "description": "The url for an image representative of the page",
+ "type": "string"
+ }
+ },
+ "required": ["url", "date"]
+}
diff --git a/browser/components/pagedata/schemas/product.schema.json b/browser/components/pagedata/schemas/product.schema.json
new file mode 100644
index 0000000000..77bec76ff2
--- /dev/null
+++ b/browser/components/pagedata/schemas/product.schema.json
@@ -0,0 +1,46 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "product.schema.json",
+ "title": "Product",
+ "description": "A product that can be purchased",
+ "type": "object",
+ "properties": {
+ "name": {
+ "description": "The product's name",
+ "type": "string"
+ },
+ "brand": {
+ "description": "The product's brand",
+ "type": "string"
+ },
+ "price": {
+ "description": "The cost of a single unit",
+ "type": "object",
+ "properties": {
+ "value": {
+ "type": "number"
+ },
+ "currency": {
+ "description": "The currency for the value",
+ "type": "string"
+ }
+ },
+ "required": ["value"]
+ },
+ "shippingCost": {
+ "description": "The cost of shipping",
+ "type": "object",
+ "properties": {
+ "value": {
+ "type": "number"
+ },
+ "currency": {
+ "description": "The currency for the value",
+ "type": "string"
+ }
+ },
+ "required": ["value"]
+ }
+ },
+ "required": ["name"]
+}
diff --git a/browser/components/pagedata/schemas/video.schema.json b/browser/components/pagedata/schemas/video.schema.json
new file mode 100644
index 0000000000..1091ebfe89
--- /dev/null
+++ b/browser/components/pagedata/schemas/video.schema.json
@@ -0,0 +1,38 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "video.schema.json",
+ "title": "Video",
+ "description": "A video",
+ "type": "object",
+ "properties": {
+ "name": {
+ "description": "The video's name",
+ "type": "string"
+ },
+ "duration": {
+ "description": "The video's duration in seconds",
+ "type": "number"
+ },
+ "quality": {
+ "description": "A short description of the video's quality (e.g. 'HD', '720p')",
+ "type": "string"
+ },
+ "show": {
+ "description": "For an episode of a TV show the name of the TV show",
+ "type": "string"
+ },
+ "season": {
+ "description": "For an episode of a TV show the season number it appears in",
+ "type": "number"
+ },
+ "episode": {
+ "description": "For an episode of a TV show the number of the episode in the season",
+ "type": "number"
+ },
+ "genre": {
+ "description": "The genre of the video",
+ "type": "string"
+ }
+ },
+ "required": ["name"]
+}
diff --git a/browser/components/pagedata/tests/browser/browser.ini b/browser/components/pagedata/tests/browser/browser.ini
new file mode 100644
index 0000000000..f07d43fc06
--- /dev/null
+++ b/browser/components/pagedata/tests/browser/browser.ini
@@ -0,0 +1,14 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+[DEFAULT]
+prefs =
+ browser.pagedata.log=true
+ browser.pagedata.enabled=true
+support-files =
+ head.js
+
+[browser_pagedata_background.js]
+[browser_pagedata_basic.js]
+[browser_pagedata_cache.js]
diff --git a/browser/components/pagedata/tests/browser/browser_pagedata_background.js b/browser/components/pagedata/tests/browser/browser_pagedata_background.js
new file mode 100644
index 0000000000..bba2ae2e47
--- /dev/null
+++ b/browser/components/pagedata/tests/browser/browser_pagedata_background.js
@@ -0,0 +1,48 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Background load tests for the page data service.
+ */
+
+const TEST_URL =
+ "data:text/html," +
+ encodeURIComponent(`
+ <html>
+ <head>
+ <meta name="twitter:card" content="summary_large_image">
+ <meta name="twitter:site" content="@nytimes">
+ <meta name="twitter:creator" content="@SarahMaslinNir">
+ <meta name="twitter:title" content="Parade of Fans for Houston’s Funeral">
+ <meta name="twitter:description" content="NEWARK - The guest list and parade of limousines">
+ <meta name="twitter:image" content="http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg">
+ </head>
+ <body>
+ </body>
+ </html>
+`);
+
+add_task(async function test_pagedata_no_data() {
+ let pageData = await PageDataService.fetchPageData(TEST_URL);
+
+ delete pageData.date;
+ Assert.deepEqual(
+ pageData,
+ {
+ url: TEST_URL,
+ siteName: "@nytimes",
+ description: "NEWARK - The guest list and parade of limousines",
+ image:
+ "http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg",
+ data: {},
+ },
+ "Should have returned the right data"
+ );
+
+ Assert.equal(
+ PageDataService.getCached(TEST_URL),
+ null,
+ "Should not have cached this data"
+ );
+});
diff --git a/browser/components/pagedata/tests/browser/browser_pagedata_basic.js b/browser/components/pagedata/tests/browser/browser_pagedata_basic.js
new file mode 100644
index 0000000000..aac34014ee
--- /dev/null
+++ b/browser/components/pagedata/tests/browser/browser_pagedata_basic.js
@@ -0,0 +1,64 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Basic tests for the page data service.
+ */
+
+const TEST_URL = "https://example.com/";
+const TEST_URL2 = "https://example.com/browser";
+
+add_task(async function test_pagedata_no_data() {
+ let promise = PageDataService.once("page-data");
+
+ await BrowserTestUtils.withNewTab(TEST_URL, async browser => {
+ let pageData = await promise;
+ Assert.equal(pageData.url, TEST_URL, "Should have returned the loaded URL");
+ Assert.deepEqual(pageData.data, {}, "Should have returned no data");
+ Assert.deepEqual(
+ PageDataService.getCached(TEST_URL),
+ pageData,
+ "Should return the same data from the cache"
+ );
+
+ promise = PageDataService.once("page-data");
+ BrowserTestUtils.loadURIString(browser, TEST_URL2);
+ await BrowserTestUtils.browserLoaded(browser, false, TEST_URL2);
+ pageData = await promise;
+ Assert.equal(
+ pageData.url,
+ TEST_URL2,
+ "Should have returned the loaded URL"
+ );
+ Assert.deepEqual(pageData.data, {}, "Should have returned no data");
+ Assert.deepEqual(
+ PageDataService.getCached(TEST_URL2),
+ pageData,
+ "Should return the same data from the cache"
+ );
+
+ info("Test going back still triggers collection");
+
+ promise = PageDataService.once("page-data");
+ let locationChangePromise = BrowserTestUtils.waitForLocationChange(
+ gBrowser,
+ TEST_URL
+ );
+ browser.goBack();
+ await locationChangePromise;
+ pageData = await promise;
+
+ Assert.equal(
+ pageData.url,
+ TEST_URL,
+ "Should have returned the URL of the previous page"
+ );
+ Assert.deepEqual(pageData.data, {}, "Should have returned no data");
+ Assert.deepEqual(
+ PageDataService.getCached(TEST_URL),
+ pageData,
+ "Should return the same data from the cache"
+ );
+ });
+});
diff --git a/browser/components/pagedata/tests/browser/browser_pagedata_cache.js b/browser/components/pagedata/tests/browser/browser_pagedata_cache.js
new file mode 100644
index 0000000000..e41b4ea2f8
--- /dev/null
+++ b/browser/components/pagedata/tests/browser/browser_pagedata_cache.js
@@ -0,0 +1,155 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Tests for the page data cache.
+ */
+
+const TEST_URL =
+ "data:text/html," +
+ encodeURIComponent(`
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset="utf-8">
+ <meta name="twitter:card" content="summary_large_image">
+ <meta name="twitter:site" content="@nytimes">
+ <meta name="twitter:creator" content="@SarahMaslinNir">
+ <meta name="twitter:title" content="Parade of Fans for Houston’s Funeral">
+ <meta name="twitter:description" content="NEWARK - The guest list and parade of limousines">
+ <meta name="twitter:image" content="http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg">
+ </head>
+ <body>
+ </body>
+ </html>
+`);
+
+/**
+ * Runs a task with a new page loaded into a tab in a new browser window.
+ *
+ * @param {string} url
+ * The url to load.
+ * @param {Function} task
+ * The task to run. May return a promise.
+ */
+async function withBrowserInNewWindow(url, task) {
+ let newWin = await BrowserTestUtils.openNewBrowserWindow();
+ let tab = await BrowserTestUtils.openNewForegroundTab(newWin.gBrowser, url);
+ await task(tab.linkedBrowser);
+ await BrowserTestUtils.closeWindow(newWin);
+}
+
+add_task(async function test_pagedata_cache() {
+ let promise = PageDataService.once("page-data");
+
+ Assert.equal(
+ PageDataService.getCached(TEST_URL),
+ null,
+ "Should be no data cached."
+ );
+
+ await BrowserTestUtils.withNewTab(TEST_URL, async () => {
+ let pageData = await promise;
+
+ Assert.deepEqual(
+ PageDataService.getCached(TEST_URL),
+ pageData,
+ "Should return the same data from the cache"
+ );
+
+ delete pageData.date;
+
+ Assert.deepEqual(
+ pageData,
+ {
+ url: TEST_URL,
+ siteName: "@nytimes",
+ description: "NEWARK - The guest list and parade of limousines",
+ image:
+ "http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg",
+ data: {},
+ },
+ "Should have returned the right data"
+ );
+ });
+
+ Assert.equal(
+ PageDataService.getCached(TEST_URL),
+ null,
+ "Data should no longer be cached."
+ );
+
+ promise = PageDataService.once("page-data");
+
+ // Checks that closing a window containing a tracked tab stops tracking the tab.
+ await withBrowserInNewWindow(TEST_URL, async () => {
+ let pageData = await promise;
+
+ Assert.deepEqual(
+ PageDataService.getCached(TEST_URL),
+ pageData,
+ "Should return the same data from the cache"
+ );
+
+ delete pageData.date;
+ Assert.deepEqual(
+ pageData,
+ {
+ url: TEST_URL,
+ siteName: "@nytimes",
+ description: "NEWARK - The guest list and parade of limousines",
+ image:
+ "http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg",
+ data: {},
+ },
+ "Should have returned the right data"
+ );
+ });
+
+ Assert.equal(
+ PageDataService.getCached(TEST_URL),
+ null,
+ "Data should no longer be cached."
+ );
+
+ let actor = {};
+ PageDataService.lockEntry(actor, TEST_URL);
+
+ promise = PageDataService.once("page-data");
+
+ // Closing a tracked tab shouldn't expire the data here as we have another lock.
+ await BrowserTestUtils.withNewTab(TEST_URL, async () => {
+ await promise;
+ });
+
+ promise = PageDataService.once("page-data");
+
+ // Closing a window with a tracked tab shouldn't expire the data here as we have another lock.
+ await withBrowserInNewWindow(TEST_URL, async () => {
+ await promise;
+ });
+
+ let cached = PageDataService.getCached(TEST_URL);
+ delete cached.date;
+ Assert.deepEqual(
+ cached,
+ {
+ url: TEST_URL,
+ siteName: "@nytimes",
+ description: "NEWARK - The guest list and parade of limousines",
+ image:
+ "http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg",
+ data: {},
+ },
+ "Entry should still be cached"
+ );
+
+ PageDataService.unlockEntry(actor, TEST_URL);
+
+ Assert.equal(
+ PageDataService.getCached(TEST_URL),
+ null,
+ "Data should no longer be cached."
+ );
+});
diff --git a/browser/components/pagedata/tests/browser/head.js b/browser/components/pagedata/tests/browser/head.js
new file mode 100644
index 0000000000..b4f57cdb76
--- /dev/null
+++ b/browser/components/pagedata/tests/browser/head.js
@@ -0,0 +1,8 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+ChromeUtils.defineESModuleGetters(this, {
+ PageDataSchema: "resource:///modules/pagedata/PageDataSchema.sys.mjs",
+ PageDataService: "resource:///modules/pagedata/PageDataService.sys.mjs",
+});
diff --git a/browser/components/pagedata/tests/unit/head.js b/browser/components/pagedata/tests/unit/head.js
new file mode 100644
index 0000000000..48ae246f6a
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/head.js
@@ -0,0 +1,103 @@
+/* Any copyright is dedicated to the Public Domain.
+ * http://creativecommons.org/publicdomain/zero/1.0/ */
+
+const { XPCOMUtils } = ChromeUtils.importESModule(
+ "resource://gre/modules/XPCOMUtils.sys.mjs"
+);
+
+ChromeUtils.defineESModuleGetters(this, {
+ PageDataSchema: "resource:///modules/pagedata/PageDataSchema.sys.mjs",
+});
+
+const { HttpServer } = ChromeUtils.import("resource://testing-common/httpd.js");
+
+const server = new HttpServer();
+server.start(-1);
+
+const SERVER_PORT = server.identity.primaryPort;
+const BASE_URL = "http://localhost:" + SERVER_PORT;
+const DEFAULT_PATH = "/document.html";
+const TEST_URL = BASE_URL + DEFAULT_PATH;
+
+registerCleanupFunction(() => {
+ server.stop();
+});
+
+do_get_profile();
+Services.prefs.setBoolPref("browser.pagedata.log", true);
+
+/**
+ * Serves a string as an HTML page from the local test server and loads it back as a DOM Document.
+ *
+ * @param {string} str
+ * The HTML markup to serve and parse.
+ * @param {string} path
+ * The path to register on the server for the document, defaults to "/document.html"
+ * @returns {Promise<Document>} resolves with the parsed HTML document; rejects if the request errors or is aborted.
+ */
+function parseDocument(str, path = DEFAULT_PATH) {
+ server.registerPathHandler(path, (request, response) => {
+ response.setHeader("Content-Type", "text/html;charset=utf-8");
+
+ let converter = Cc[
+ "@mozilla.org/intl/converter-output-stream;1"
+ ].createInstance(Ci.nsIConverterOutputStream);
+ converter.init(response.bodyOutputStream, "utf-8");
+ converter.writeString(str);
+ });
+
+ return new Promise((resolve, reject) => {
+ let request = new XMLHttpRequest();
+ request.responseType = "document";
+ request.open("GET", BASE_URL + path, true);
+
+ request.addEventListener("error", reject);
+ request.addEventListener("abort", reject);
+
+ request.addEventListener("load", function () {
+ resolve(request.responseXML);
+ });
+
+ request.send();
+ });
+}
+
+/**
+ * Parses a HTML string into a document and collects the page data from it.
+ *
+ * @param {string} str
+ * The HTML string to parse.
+ * @param {string} path
+ * The path for the document on the server; when omitted parseDocument's default of "/document.html" applies.
+ * @returns {Promise<PageData>} A promise that resolves to the result of PageDataSchema.collectPageData for the document.
+ */
+async function parsePageData(str, path) {
+ let doc = await parseDocument(str, path);
+ return PageDataSchema.collectPageData(doc);
+}
+
+/**
+ * Verifies that the HTML string given parses to the expected page data.
+ *
+ * @param {string} str
+ * The HTML string to parse.
+ * @param {PageData} expected
+ * The expected pagedata excluding the date and url properties; the url is asserted separately against the served path.
+ * @param {string} path
+ * The path for the document on the server, defaults to "/document.html"
+ * @returns {Promise<void>} A promise that resolves once the assertions have run; no page data is returned.
+ */
+async function verifyPageData(str, expected, path = DEFAULT_PATH) {
+ let pageData = await parsePageData(str, path);
+
+ delete pageData.date;
+
+ Assert.equal(pageData.url, BASE_URL + path);
+ delete pageData.url;
+
+ Assert.deepEqual(
+ pageData,
+ expected,
+ "Should have seen the expected page data."
+ );
+}
diff --git a/browser/components/pagedata/tests/unit/test_opengraph.js b/browser/components/pagedata/tests/unit/test_opengraph.js
new file mode 100644
index 0000000000..e5accaf675
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/test_opengraph.js
@@ -0,0 +1,67 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Tests that the page data service can parse Open Graph metadata.
+ */
+
+add_task(async function test_type_website() {
+ await verifyPageData(
+ `
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <title>Internet for people, not profit — Mozilla</title>
+ <meta property="og:type" content="website">
+ <meta property="og:site_name" content="Mozilla">
+ <meta property="og:url" content="https://www.mozilla.org/">
+ <meta property="og:image" content="https://example.com/preview-image">
+ <meta property="og:title" content="Internet for people, not profit">
+ <!-- We expect the test will ignore tags the parser does not recognize. -->
+ <meta property="og:locale" content="en_CA">
+ <meta property="og:description" content="Mozilla is the not-for-profit behind the lightning fast Firefox browser. We put people over profit to give everyone more power online.">
+ </head>
+ <body>
+ <p>Test page</p>
+ </body>
+ </html>
+ `,
+ {
+ siteName: "Mozilla",
+ description:
+ "Mozilla is the not-for-profit behind the lightning fast Firefox browser. We put people over profit to give everyone more power online.",
+ image: "https://example.com/preview-image",
+ data: {},
+ }
+ );
+});
+
+add_task(async function test_type_movie() {
+ await verifyPageData(
+ `
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <title>Code Rush (TV Movie 2000)</title>
+ <meta property="og:url" content="https://www.imdb.com/title/tt0499004/"/>
+ <!-- Omitting og:site_name to test that the parser doesn't break on missing tags. -->
+ <meta property="og:title" content="Code Rush (TV Movie 2000) - IMDb"/>
+ <meta property="og:description" content="This is the description of the movie."/>
+ <meta property="og:type" content="video.movie"/>
+ <meta property="og:image" content="https://example.com/preview-code-rush"/>
+ <meta property="og:image:height" content="750"/>
+ <meta property="og:image:width" content="1000"/>
+ </head>
+ <body>
+ <p>Test page</p>
+ </body>
+ </html>
+ `,
+ {
+ image: "https://example.com/preview-code-rush",
+ description: "This is the description of the movie.",
+ data: {},
+ }
+ );
+});
diff --git a/browser/components/pagedata/tests/unit/test_pagedata_basic.js b/browser/components/pagedata/tests/unit/test_pagedata_basic.js
new file mode 100644
index 0000000000..5d31645a4c
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/test_pagedata_basic.js
@@ -0,0 +1,100 @@
+/* Any copyright is dedicated to the Public Domain.
+ * http://creativecommons.org/publicdomain/zero/1.0/ */
+
+/*
+ * Tests that the notification is dispatched when new page data is
+ * discovered and that the data is only cached while an actor locks the url.
+ */
+
+ChromeUtils.defineESModuleGetters(this, {
+ PageDataService: "resource:///modules/pagedata/PageDataService.sys.mjs",
+});
+
+add_task(async function test_pageDataDiscovered_notifies() {
+ let url = "https://www.mozilla.org/";
+
+ Assert.equal(
+ PageDataService.getCached(url),
+ null,
+ "Should be no cached data."
+ );
+
+ let promise = PageDataService.once("page-data");
+
+ PageDataService.pageDataDiscovered({
+ url,
+ date: 32453456,
+ data: {
+ [PageDataSchema.DATA_TYPE.PRODUCT]: {
+ name: "Bolts",
+ price: { value: 276 },
+ },
+ },
+ });
+
+ let pageData = await promise;
+ Assert.equal(
+ pageData.url,
+ url,
+ "Should have notified data for the expected url"
+ );
+
+ Assert.deepEqual(
+ pageData,
+ {
+ url,
+ date: 32453456,
+ data: {
+ [PageDataSchema.DATA_TYPE.PRODUCT]: {
+ name: "Bolts",
+ price: { value: 276 },
+ },
+ },
+ },
+ "Should have returned the correct product data"
+ );
+
+ Assert.equal(
+ PageDataService.getCached(url),
+ null,
+ "Should not have cached the data as there was no actor locking."
+ );
+
+ let actor = {};
+ PageDataService.lockEntry(actor, url);
+
+ PageDataService.pageDataDiscovered({
+ url,
+ date: 32453456,
+ data: {
+ [PageDataSchema.DATA_TYPE.PRODUCT]: {
+ name: "Bolts",
+ price: { value: 276 },
+ },
+ },
+ });
+
+ // Should now be in the cache.
+ Assert.deepEqual(
+ PageDataService.getCached(url),
+ {
+ url,
+ date: 32453456,
+ data: {
+ [PageDataSchema.DATA_TYPE.PRODUCT]: {
+ name: "Bolts",
+ price: { value: 276 },
+ },
+ },
+ },
+ "Should have cached the data"
+ );
+
+ PageDataService.unlockEntry(actor, url);
+
+ Assert.equal(
+ PageDataService.getCached(url),
+ null,
+ "Should have dropped the data from the cache."
+ );
+});
diff --git a/browser/components/pagedata/tests/unit/test_pagedata_schema.js b/browser/components/pagedata/tests/unit/test_pagedata_schema.js
new file mode 100644
index 0000000000..fcd9c4b297
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/test_pagedata_schema.js
@@ -0,0 +1,210 @@
+/* Any copyright is dedicated to the Public Domain.
+ * http://creativecommons.org/publicdomain/zero/1.0/ */
+
+/*
+ * Tests schema validation.
+ */
+
+add_task(async function testBasic() {
+ // Old data types, should not be recognised.
+ Assert.equal(PageDataSchema.nameForType(1), null);
+ Assert.equal(PageDataSchema.nameForType(2), null);
+
+ Assert.equal(
+ PageDataSchema.nameForType(PageDataSchema.DATA_TYPE.VIDEO),
+ "VIDEO"
+ );
+ Assert.equal(
+ PageDataSchema.nameForType(PageDataSchema.DATA_TYPE.PRODUCT),
+ "PRODUCT"
+ );
+});
+
+add_task(async function testProduct() {
+ // Products must have a name
+ await Assert.rejects(
+ PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, {}),
+ /missing required property 'name'/
+ );
+
+ await PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, {
+ name: "Bolts",
+ });
+
+ await PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, {
+ name: "Bolts",
+ price: {
+ value: 5,
+ },
+ });
+
+ await PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, {
+ name: "Bolts",
+ price: {
+ value: 5,
+ currency: "USD",
+ },
+ });
+
+ await Assert.rejects(
+ PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, {
+ name: "Bolts",
+ price: {
+ currency: "USD",
+ },
+ }),
+ /missing required property 'value'/
+ );
+
+ await PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, {
+ name: "Bolts",
+ shippingCost: {
+ value: 5,
+ currency: "USD",
+ },
+ });
+
+ await Assert.rejects(
+ PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, {
+ name: "Bolts",
+ shippingCost: {
+ currency: "USD",
+ },
+ }),
+ /missing required property 'value'/
+ );
+});
+
+add_task(async function testCoalesce() {
+ let joined = PageDataSchema.coalescePageData({}, {});
+ Assert.deepEqual(joined, { data: {} });
+
+ joined = PageDataSchema.coalescePageData(
+ {
+ url: "https://www.google.com/",
+ data: {
+ [PageDataSchema.DATA_TYPE.PRODUCT]: {
+ name: "bolts",
+ },
+ [PageDataSchema.DATA_TYPE.VIDEO]: {
+ name: "My video",
+ duration: 500,
+ },
+ },
+ },
+ {
+ url: "https://www.mozilla.com/",
+ date: 27,
+ siteName: "Mozilla",
+ data: {
+ [PageDataSchema.DATA_TYPE.PRODUCT]: {
+ name: "newname",
+ price: {
+ value: 55,
+ },
+ },
+ [PageDataSchema.DATA_TYPE.AUDIO]: {
+ name: "My song",
+ },
+ },
+ }
+ );
+
+ Assert.deepEqual(joined, {
+ url: "https://www.google.com/",
+ date: 27,
+ siteName: "Mozilla",
+ data: {
+ [PageDataSchema.DATA_TYPE.PRODUCT]: {
+ name: "bolts",
+ price: {
+ value: 55,
+ },
+ },
+ [PageDataSchema.DATA_TYPE.VIDEO]: {
+ name: "My video",
+ duration: 500,
+ },
+ [PageDataSchema.DATA_TYPE.AUDIO]: {
+ name: "My song",
+ },
+ },
+ });
+});
+
+add_task(async function testPageData() {
+ // Full page data needs a url and a date
+ await Assert.rejects(
+ PageDataSchema.validatePageData({}),
+ /missing required property 'url'/
+ );
+
+ await Assert.rejects(
+ PageDataSchema.validatePageData({ url: "https://www.google.com" }),
+ /missing required property 'date'/
+ );
+
+ await Assert.rejects(
+ PageDataSchema.validatePageData({ date: 55 }),
+ /missing required property 'url'/
+ );
+
+ Assert.deepEqual(
+ await PageDataSchema.validatePageData({
+ url: "https://www.google.com",
+ date: 55,
+ }),
+ { url: "https://www.google.com", date: 55, data: {} }
+ );
+
+ Assert.deepEqual(
+ await PageDataSchema.validatePageData({
+ url: "https://www.google.com",
+ date: 55,
+ data: {
+ 0: {
+ name: "unknown",
+ },
+ [PageDataSchema.DATA_TYPE.PRODUCT]: {
+ name: "Bolts",
+ price: {
+ value: 55,
+ },
+ },
+ },
+ }),
+ {
+ url: "https://www.google.com",
+ date: 55,
+ data: {
+ [PageDataSchema.DATA_TYPE.PRODUCT]: {
+ name: "Bolts",
+ price: {
+ value: 55,
+ },
+ },
+ },
+ }
+ );
+
+ // Should drop invalid inner data.
+ Assert.deepEqual(
+ await PageDataSchema.validatePageData({
+ url: "https://www.google.com",
+ date: 55,
+ data: {
+ [PageDataSchema.DATA_TYPE.PRODUCT]: {
+ name: "Bolts",
+ price: {
+ currency: "USD",
+ },
+ },
+ },
+ }),
+ {
+ url: "https://www.google.com",
+ date: 55,
+ data: {},
+ }
+ );
+});
diff --git a/browser/components/pagedata/tests/unit/test_queue.js b/browser/components/pagedata/tests/unit/test_queue.js
new file mode 100644
index 0000000000..3d180edd13
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/test_queue.js
@@ -0,0 +1,527 @@
+/* Any copyright is dedicated to the Public Domain.
+ * http://creativecommons.org/publicdomain/zero/1.0/ */
+
+ChromeUtils.defineESModuleGetters(this, {
+ PageDataService: "resource:///modules/pagedata/PageDataService.sys.mjs",
+ PromiseUtils: "resource://gre/modules/PromiseUtils.sys.mjs",
+});
+
+// Test that urls are retrieved in the expected order.
+add_task(async function test_queueOrder() {
+ Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 0);
+ // Pretend we are idle.
+ PageDataService.observe(null, "idle", null);
+
+ let pageDataResults = [
+ {
+ date: Date.now(),
+ url: "http://www.mozilla.org/1",
+ siteName: "Mozilla",
+ data: {},
+ },
+ {
+ date: Date.now() - 3600,
+ url: "http://www.google.com/2",
+ siteName: "Google",
+ data: {},
+ },
+ {
+ date: Date.now() + 3600,
+ url: "http://www.example.com/3",
+ image: "http://www.example.com/banner.jpg",
+ data: {},
+ },
+ {
+ date: Date.now() / 2,
+ url: "http://www.wikipedia.org/4",
+ data: {},
+ },
+ {
+ date: Date.now() / 3,
+ url: "http://www.microsoft.com/5",
+ data: {
+ [PageDataSchema.DATA_TYPE.PRODUCT]: {
+ name: "Windows 11",
+ },
+ },
+ },
+ ];
+
+ let requests = [];
+ PageDataService.fetchPageData = url => {
+ requests.push(url);
+
+ for (let pageData of pageDataResults) {
+ if (pageData.url == url) {
+ return Promise.resolve(pageData);
+ }
+ }
+
+ return Promise.reject(new Error("Unknown url"));
+ };
+
+ let { promise: completePromise, resolve } = PromiseUtils.defer();
+
+ let results = [];
+ let listener = (_, pageData) => {
+ results.push(pageData);
+ if (results.length == pageDataResults.length) {
+ resolve();
+ }
+ };
+
+ PageDataService.on("page-data", listener);
+
+ for (let pageData of pageDataResults) {
+ PageDataService.queueFetch(pageData.url);
+ }
+
+ await completePromise;
+ PageDataService.off("page-data", listener);
+
+ Assert.deepEqual(
+ requests,
+ pageDataResults.map(pd => pd.url)
+ );
+
+ // Because our fetch implementation is essentially synchronous the results
+ // will be in a known order. This isn't guaranteed by the API though.
+ Assert.deepEqual(results, pageDataResults);
+
+ delete PageDataService.fetchPageData;
+});
+
+// Tests that limiting the number of fetches works.
+add_task(async function test_queueLimit() {
+ Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 3);
+ // Pretend we are idle.
+ PageDataService.observe(null, "idle", null);
+
+ let requests = [];
+ PageDataService.fetchPageData = url => {
+ let { promise, resolve, reject } = PromiseUtils.defer();
+ requests.push({ url, resolve, reject });
+
+ return promise;
+ };
+
+ let results = [];
+ let listener = (_, pageData) => {
+ results.push(pageData?.url);
+ };
+
+ PageDataService.on("page-data", listener);
+
+ PageDataService.queueFetch("https://www.mozilla.org/1");
+ PageDataService.queueFetch("https://www.mozilla.org/2");
+ PageDataService.queueFetch("https://www.mozilla.org/3");
+ PageDataService.queueFetch("https://www.mozilla.org/4");
+ PageDataService.queueFetch("https://www.mozilla.org/5");
+ PageDataService.queueFetch("https://www.mozilla.org/6");
+ PageDataService.queueFetch("https://www.mozilla.org/7");
+ PageDataService.queueFetch("https://www.mozilla.org/8");
+ PageDataService.queueFetch("https://www.mozilla.org/9");
+ PageDataService.queueFetch("https://www.mozilla.org/10");
+ PageDataService.queueFetch("https://www.mozilla.org/11");
+
+ // Let a tick pass.
+ await Promise.resolve();
+
+ Assert.deepEqual(
+ requests.map(r => r.url),
+ [
+ "https://www.mozilla.org/1",
+ "https://www.mozilla.org/2",
+ "https://www.mozilla.org/3",
+ ]
+ );
+
+ // Completing or rejecting a request should start new ones.
+
+ requests[1].resolve({
+ date: 2345,
+ url: "https://www.mozilla.org/2",
+ siteName: "Test 2",
+ data: {},
+ });
+
+ // Let a tick pass.
+ await Promise.resolve();
+
+ Assert.deepEqual(
+ requests.map(r => r.url),
+ [
+ "https://www.mozilla.org/1",
+ "https://www.mozilla.org/2",
+ "https://www.mozilla.org/3",
+ "https://www.mozilla.org/4",
+ ]
+ );
+
+ requests[3].reject(new Error("Fail"));
+
+ // Let a tick pass.
+ await Promise.resolve();
+
+ Assert.deepEqual(
+ requests.map(r => r.url),
+ [
+ "https://www.mozilla.org/1",
+ "https://www.mozilla.org/2",
+ "https://www.mozilla.org/3",
+ "https://www.mozilla.org/4",
+ "https://www.mozilla.org/5",
+ ]
+ );
+
+ // Increasing the limit should start more requests.
+ Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 5);
+
+ // Let a tick pass.
+ await Promise.resolve();
+
+ Assert.deepEqual(
+ requests.map(r => r.url),
+ [
+ "https://www.mozilla.org/1",
+ "https://www.mozilla.org/2",
+ "https://www.mozilla.org/3",
+ "https://www.mozilla.org/4",
+ "https://www.mozilla.org/5",
+ "https://www.mozilla.org/6",
+ "https://www.mozilla.org/7",
+ ]
+ );
+
+ // Dropping the limit shouldn't start anything new.
+ Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 3);
+
+ // Let a tick pass.
+ await Promise.resolve();
+
+ Assert.deepEqual(
+ requests.map(r => r.url),
+ [
+ "https://www.mozilla.org/1",
+ "https://www.mozilla.org/2",
+ "https://www.mozilla.org/3",
+ "https://www.mozilla.org/4",
+ "https://www.mozilla.org/5",
+ "https://www.mozilla.org/6",
+ "https://www.mozilla.org/7",
+ ]
+ );
+
+ // But resolving should also not start new requests.
+ requests[5].resolve({
+ date: 345334,
+ url: "https://www.mozilla.org/6",
+ siteName: "Test 6",
+ data: {},
+ });
+
+ requests[0].resolve({
+ date: 343446434,
+ url: "https://www.mozilla.org/1",
+ siteName: "Test 1",
+ data: {},
+ });
+
+ // Let a tick pass.
+ await Promise.resolve();
+
+ Assert.deepEqual(
+ requests.map(r => r.url),
+ [
+ "https://www.mozilla.org/1",
+ "https://www.mozilla.org/2",
+ "https://www.mozilla.org/3",
+ "https://www.mozilla.org/4",
+ "https://www.mozilla.org/5",
+ "https://www.mozilla.org/6",
+ "https://www.mozilla.org/7",
+ ]
+ );
+
+ // Until a previous request completes.
+ requests[4].resolve(null);
+
+ // Let a tick pass.
+ await Promise.resolve();
+
+ Assert.deepEqual(
+ requests.map(r => r.url),
+ [
+ "https://www.mozilla.org/1",
+ "https://www.mozilla.org/2",
+ "https://www.mozilla.org/3",
+ "https://www.mozilla.org/4",
+ "https://www.mozilla.org/5",
+ "https://www.mozilla.org/6",
+ "https://www.mozilla.org/7",
+ "https://www.mozilla.org/8",
+ ]
+ );
+
+ // Infinite queue should work (a limit of 0 removes the cap).
+ Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 0);
+
+ // Let a tick pass.
+ await Promise.resolve();
+
+ Assert.deepEqual(
+ requests.map(r => r.url),
+ [
+ "https://www.mozilla.org/1",
+ "https://www.mozilla.org/2",
+ "https://www.mozilla.org/3",
+ "https://www.mozilla.org/4",
+ "https://www.mozilla.org/5",
+ "https://www.mozilla.org/6",
+ "https://www.mozilla.org/7",
+ "https://www.mozilla.org/8",
+ "https://www.mozilla.org/9",
+ "https://www.mozilla.org/10",
+ "https://www.mozilla.org/11",
+ ]
+ );
+
+ requests[10].resolve({
+ date: 345334,
+ url: "https://www.mozilla.org/11",
+ data: {},
+ });
+ requests[2].resolve({
+ date: 345334,
+ url: "https://www.mozilla.org/3",
+ data: {},
+ });
+ requests[7].resolve({
+ date: 345334,
+ url: "https://www.mozilla.org/8",
+ data: {},
+ });
+ requests[6].resolve({
+ date: 345334,
+ url: "https://www.mozilla.org/7",
+ data: {},
+ });
+ requests[8].resolve({
+ date: 345334,
+ url: "https://www.mozilla.org/9",
+ data: {},
+ });
+ requests[9].resolve({
+ date: 345334,
+ url: "https://www.mozilla.org/10",
+ data: {},
+ });
+
+ // Let a tick pass.
+ await Promise.resolve();
+
+ Assert.deepEqual(
+ requests.map(r => r.url),
+ [
+ "https://www.mozilla.org/1",
+ "https://www.mozilla.org/2",
+ "https://www.mozilla.org/3",
+ "https://www.mozilla.org/4",
+ "https://www.mozilla.org/5",
+ "https://www.mozilla.org/6",
+ "https://www.mozilla.org/7",
+ "https://www.mozilla.org/8",
+ "https://www.mozilla.org/9",
+ "https://www.mozilla.org/10",
+ "https://www.mozilla.org/11",
+ ]
+ );
+
+ PageDataService.off("page-data", listener);
+
+ delete PageDataService.fetchPageData;
+
+ Assert.deepEqual(results, [
+ "https://www.mozilla.org/2",
+ "https://www.mozilla.org/6",
+ "https://www.mozilla.org/1",
+ "https://www.mozilla.org/11",
+ "https://www.mozilla.org/3",
+ "https://www.mozilla.org/8",
+ "https://www.mozilla.org/7",
+ "https://www.mozilla.org/9",
+ "https://www.mozilla.org/10",
+ ]);
+});
+
+// Tests that the user idle state stops and starts fetches.
+add_task(async function test_idle() {
+ Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 3);
+ // Pretend we are active.
+ PageDataService.observe(null, "active", null);
+
+ let requests = [];
+ PageDataService.fetchPageData = url => {
+ let { promise, resolve, reject } = PromiseUtils.defer();
+ requests.push({ url, resolve, reject });
+
+ return promise;
+ };
+
+ let results = [];
+ let listener = (_, pageData) => {
+ results.push(pageData?.url);
+ };
+
+ PageDataService.on("page-data", listener);
+
+ PageDataService.queueFetch("https://www.mozilla.org/1");
+ PageDataService.queueFetch("https://www.mozilla.org/2");
+ PageDataService.queueFetch("https://www.mozilla.org/3");
+ PageDataService.queueFetch("https://www.mozilla.org/4");
+ PageDataService.queueFetch("https://www.mozilla.org/5");
+ PageDataService.queueFetch("https://www.mozilla.org/6");
+ PageDataService.queueFetch("https://www.mozilla.org/7");
+
+ // Let a tick pass.
+ await Promise.resolve();
+
+ // Nothing will start when active.
+ Assert.deepEqual(
+ requests.map(r => r.url),
+ []
+ );
+
+ // Pretend we are idle.
+ PageDataService.observe(null, "idle", null);
+
+ // Let a tick pass.
+ await Promise.resolve();
+
+ Assert.deepEqual(
+ requests.map(r => r.url),
+ [
+ "https://www.mozilla.org/1",
+ "https://www.mozilla.org/2",
+ "https://www.mozilla.org/3",
+ ]
+ );
+
+ // Completing or rejecting a request should start new ones.
+
+ requests[1].resolve({
+ date: 2345,
+ url: "https://www.mozilla.org/2",
+ data: {},
+ });
+
+ // Let a tick pass.
+ await Promise.resolve();
+
+ Assert.deepEqual(
+ requests.map(r => r.url),
+ [
+ "https://www.mozilla.org/1",
+ "https://www.mozilla.org/2",
+ "https://www.mozilla.org/3",
+ "https://www.mozilla.org/4",
+ ]
+ );
+
+ // But not when active
+ PageDataService.observe(null, "active", null);
+
+ requests[3].resolve({
+ date: 2345,
+ url: "https://www.mozilla.org/4",
+ data: {},
+ });
+ requests[0].resolve({
+ date: 2345,
+ url: "https://www.mozilla.org/1",
+ data: {},
+ });
+ requests[2].resolve({
+ date: 2345,
+ url: "https://www.mozilla.org/3",
+ data: {},
+ });
+
+ // Let a tick pass.
+ await Promise.resolve();
+
+ Assert.deepEqual(
+ requests.map(r => r.url),
+ [
+ "https://www.mozilla.org/1",
+ "https://www.mozilla.org/2",
+ "https://www.mozilla.org/3",
+ "https://www.mozilla.org/4",
+ ]
+ );
+
+ // Going idle should start more workers
+ PageDataService.observe(null, "idle", null);
+
+ // Let a tick pass.
+ await Promise.resolve();
+
+ Assert.deepEqual(
+ requests.map(r => r.url),
+ [
+ "https://www.mozilla.org/1",
+ "https://www.mozilla.org/2",
+ "https://www.mozilla.org/3",
+ "https://www.mozilla.org/4",
+ "https://www.mozilla.org/5",
+ "https://www.mozilla.org/6",
+ "https://www.mozilla.org/7",
+ ]
+ );
+
+ requests[4].resolve({
+ date: 2345,
+ url: "https://www.mozilla.org/5",
+ data: {},
+ });
+ requests[5].resolve({
+ date: 2345,
+ url: "https://www.mozilla.org/6",
+ data: {},
+ });
+ requests[6].resolve({
+ date: 2345,
+ url: "https://www.mozilla.org/7",
+ data: {},
+ });
+
+ // Let a tick pass.
+ await Promise.resolve();
+
+ Assert.deepEqual(
+ requests.map(r => r.url),
+ [
+ "https://www.mozilla.org/1",
+ "https://www.mozilla.org/2",
+ "https://www.mozilla.org/3",
+ "https://www.mozilla.org/4",
+ "https://www.mozilla.org/5",
+ "https://www.mozilla.org/6",
+ "https://www.mozilla.org/7",
+ ]
+ );
+
+ PageDataService.off("page-data", listener);
+
+ delete PageDataService.fetchPageData;
+
+ Assert.deepEqual(results, [
+ "https://www.mozilla.org/2",
+ "https://www.mozilla.org/4",
+ "https://www.mozilla.org/1",
+ "https://www.mozilla.org/3",
+ "https://www.mozilla.org/5",
+ "https://www.mozilla.org/6",
+ "https://www.mozilla.org/7",
+ ]);
+});
diff --git a/browser/components/pagedata/tests/unit/test_schemaorg.js b/browser/components/pagedata/tests/unit/test_schemaorg.js
new file mode 100644
index 0000000000..5470410e4f
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/test_schemaorg.js
@@ -0,0 +1,213 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Tests that the page data service can parse schema.org metadata into PageData.
+ */
+
+add_task(async function test_single_product_microdata() {
+ await verifyPageData(
+ `
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <title>Product Info 1</title>
+ </head>
+ <body>
+ <div itemscope itemtype="https://schema.org/Organization">
+ <div itemprop="employee" itemscope itemtype="https://schema.org/Person">
+ <span itemprop="name">Mr. Nested Name</span>
+ </div>
+
+ <span itemprop="name">Mozilla</span>
+ </div>
+
+ <div itemscope itemtype="https://schema.org/Product">
+ <img itemprop="image" src="bon-echo-microwave-17in.jpg" />
+ <a href="microwave.html" itemprop="url">
+ <span itemprop="name">Bon Echo Microwave</span>
+ </a>
+
+ <div itemprop="offers" itemscope itemtype="https://schema.org/Offer">
+ <span itemprop="price" content="3.50">£3.50</span>
+ <span itemprop="priceCurrency" content="GBP"></span>
+ </div>
+
+ <span itemprop="gtin" content="13572468"></span>
+
+ <span itemprop="description">The most amazing microwave in the world</span>
+ </div>
+ </body>
+ </html>
+ `,
+ {
+ siteName: "Mozilla",
+ description: "The most amazing microwave in the world",
+ image: BASE_URL + "/bon-echo-microwave-17in.jpg",
+ data: {
+ [PageDataSchema.DATA_TYPE.PRODUCT]: {
+ name: "Bon Echo Microwave",
+ price: {
+ value: 3.5,
+ currency: "GBP",
+ },
+ },
+ },
+ }
+ );
+});
+
+add_task(async function test_single_product_json_ld() {
+ await verifyPageData(
+ `
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <script type="application/ld+json">
+ {
+ "@context": "http://schema.org",
+ "@type": "Organization",
+ "employee": {
+ "@type": "Person",
+ "name": "Mr. Nested Name"
+ },
+ "name": "Mozilla"
+ }
+ </script>
+ <script type="application/ld+json">
+ {
+ "@context": "https://schema.org",
+ "@type": "Product",
+ "image": "bon-echo-microwave-17in.jpg",
+ "url": "microwave.html",
+ "name": "Bon Echo Microwave",
+ "offers": {
+ "@type": "Offer",
+ "price": "3.50",
+ "priceCurrency": "GBP"
+ },
+ "gtin": "13572468",
+ "description": "The most amazing microwave in the world"
+ }
+ </script>
+ </head>
+ <body>
+ </body>
+ </html>
+ `,
+ {
+ siteName: "Mozilla",
+ description: "The most amazing microwave in the world",
+ image: BASE_URL + "/bon-echo-microwave-17in.jpg",
+ data: {
+ [PageDataSchema.DATA_TYPE.PRODUCT]: {
+ name: "Bon Echo Microwave",
+ price: {
+ value: 3.5,
+ currency: "GBP",
+ },
+ },
+ },
+ }
+ );
+});
+
+add_task(async function test_single_product_combined() {
+ await verifyPageData(
+ `
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <script type="application/ld+json">
+ {
+ "@context": "https://schema.org",
+ "@type": "Product",
+ "image": "bon-echo-microwave-17in.jpg",
+ "url": "microwave.html",
+ "name": "Bon Echo Microwave",
+ "offers": {
+ "@type": "Offer",
+ "price": "3.50",
+ "priceCurrency": "GBP"
+ },
+ "gtin": "13572468",
+ "description": "The most amazing microwave in the world"
+ }
+ </script>
+ </head>
+ <body>
+ <div itemscope itemtype="https://schema.org/Organization">
+ <div itemprop="employee" itemscope itemtype="https://schema.org/Person">
+ <span itemprop="name">Mr. Nested Name</span>
+ </div>
+
+ <span itemprop="name">Mozilla</span>
+ </div>
+ </body>
+ </html>
+ `,
+ {
+ siteName: "Mozilla",
+ description: "The most amazing microwave in the world",
+ image: BASE_URL + "/bon-echo-microwave-17in.jpg",
+ data: {
+ [PageDataSchema.DATA_TYPE.PRODUCT]: {
+ name: "Bon Echo Microwave",
+ price: {
+ value: 3.5,
+ currency: "GBP",
+ },
+ },
+ },
+ }
+ );
+});
+
+add_task(async function test_single_multiple_microdata() {
+ await verifyPageData(
+ `
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <title>Product Info 2</title>
+ </head>
+ <body>
+ <div itemscope itemtype="https://schema.org/Product">
+ <img itemprop="image" src="bon-echo-microwave-17in.jpg" />
+ <a href="microwave.html" itemprop="url">
+ <span itemprop="name">Bon Echo Microwave</span>
+ </a>
+
+ <div itemprop="offers" itemscope itemtype="https://schema.org/Offer">
+ <span itemprop="price" content="3.28">£3.28</span>
+ <span itemprop="priceCurrency" content="GBP"></span>
+ </div>
+
+ <span itemprop="gtin" content="13572468"></span>
+ </div>
+ <div itemscope itemtype="http://schema.org/Product">
+ <img itemprop="image" src="gran-paradiso-toaster-17in.jpg" />
+ <a href="toaster.html" itemprop="url">
+ <span itemprop="name">Gran Paradiso Toaster</span>
+ </a>
+
+ <span itemprop="gtin" content="15263748"></span>
+ </div>
+ </body>
+ </html>
+ `,
+ {
+ image: BASE_URL + "/bon-echo-microwave-17in.jpg",
+ data: {
+ [PageDataSchema.DATA_TYPE.PRODUCT]: {
+ name: "Bon Echo Microwave",
+ price: {
+ value: 3.28,
+ currency: "GBP",
+ },
+ },
+ },
+ }
+ );
+});
diff --git a/browser/components/pagedata/tests/unit/test_schemaorg_parse.js b/browser/components/pagedata/tests/unit/test_schemaorg_parse.js
new file mode 100644
index 0000000000..e002598af2
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/test_schemaorg_parse.js
@@ -0,0 +1,193 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Tests that the page data service can parse schema.org metadata into Item
+ * structures.
+ */
+
+const { SchemaOrgPageData } = ChromeUtils.importESModule(
+ "resource:///modules/pagedata/SchemaOrgPageData.sys.mjs"
+);
+
+/**
+ * Collects the schema.org items from the given html string.
+ *
+ * @param {string} docStr
+ * The html to parse.
+ * @returns {Promise<Item[]>}
+ */
+async function collectItems(docStr) {
+ let doc = await parseDocument(docStr);
+ return SchemaOrgPageData.collectItems(doc);
+}
+
+/**
+ * Verifies that the items parsed from the html match the expected JSON-LD
+ * format.
+ *
+ * @param {string} docStr
+ * The html to parse.
+ * @param {object[]} expected
+ * The JSON-LD objects to match to.
+ */
+async function verifyItems(docStr, expected) {
+ let items = await collectItems(docStr);
+ let jsonLD = items.map(item => item.toJsonLD());
+ Assert.deepEqual(jsonLD, expected);
+}
+
+/**
+ * Checks that microdata markup is collected as two top-level items: an
+ * Organization with a nested Person employee, and a Product with a nested
+ * Offer. The `itemprop="badprop"` on the body element, which is outside any
+ * item scope, does not show up in the expected JSON-LD. The expected image and
+ * url are prefixed with BASE_URL, while the price stays the string "3.50".
+ */
+add_task(async function test_microdata_parse() {
+ await verifyItems(
+ `
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <title>Product Info 1</title>
+ </head>
+ <body itemprop="badprop">
+ <div itemscope itemtype="https://schema.org/Organization">
+ <div itemprop="employee" itemscope itemtype="https://schema.org/Person">
+ <span itemprop="name">Mr. Nested Name</span>
+ </div>
+
+ <span itemprop="name">Mozilla</span>
+ </div>
+
+ <div itemscope itemtype="https://schema.org/Product">
+ <img itemprop="image" src="bon-echo-microwave-17in.jpg" />
+ <a href="microwave.html" itemprop="url">
+ <span itemprop="name">Bon Echo Microwave</span>
+ </a>
+
+ <div itemprop="offers" itemscope itemtype="https://schema.org/Offer">
+ <span itemprop="price" content="3.50">£3.50</span>
+ <span itemprop="priceCurrency" content="GBP"></span>
+ </div>
+
+ <span itemprop="gtin" content="13572468"></span>
+
+ <span itemprop="description">The most amazing microwave in the world</span>
+ </div>
+ </body>
+ </html>
+ `,
+ [
+ {
+ "@type": "Organization",
+ employee: {
+ "@type": "Person",
+ name: "Mr. Nested Name",
+ },
+ name: "Mozilla",
+ },
+ {
+ "@type": "Product",
+ image: BASE_URL + "/bon-echo-microwave-17in.jpg",
+ url: BASE_URL + "/microwave.html",
+ name: "Bon Echo Microwave",
+ offers: {
+ "@type": "Offer",
+ price: "3.50",
+ priceCurrency: "GBP",
+ },
+ gtin: "13572468",
+ description: "The most amazing microwave in the world",
+ },
+ ]
+ );
+});
+
+/**
+ * Checks that items declared in two `application/ld+json` script elements are
+ * collected in document order. The expected JSON-LD omits the "@context"
+ * entries, and the relative image and url values are expected exactly as
+ * written in the script (no BASE_URL prefix).
+ */
+add_task(async function test_json_ld_parse() {
+ await verifyItems(
+ `
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <script type="application/ld+json">
+ {
+ "@context": "http://schema.org",
+ "@type": "Organization",
+ "employee": {
+ "@type": "Person",
+ "name": "Mr. Nested Name"
+ },
+ "name": "Mozilla"
+ }
+ </script>
+ <script type="application/ld+json">
+ {
+ "@context": "https://schema.org",
+ "@type": "Product",
+ "image": "bon-echo-microwave-17in.jpg",
+ "url": "microwave.html",
+ "name": "Bon Echo Microwave",
+ "offers": {
+ "@type": "Offer",
+ "price": "3.50",
+ "priceCurrency": "GBP"
+ },
+ "gtin": "13572468",
+ "description": "The most amazing microwave in the world"
+ }
+ </script>
+ </head>
+ <body>
+ </body>
+ </html>
+ `,
+ [
+ {
+ "@type": "Organization",
+ employee: {
+ "@type": "Person",
+ name: "Mr. Nested Name",
+ },
+ name: "Mozilla",
+ },
+ {
+ "@type": "Product",
+ image: "bon-echo-microwave-17in.jpg",
+ url: "microwave.html",
+ name: "Bon Echo Microwave",
+ offers: {
+ "@type": "Offer",
+ price: "3.50",
+ priceCurrency: "GBP",
+ },
+ gtin: "13572468",
+ description: "The most amazing microwave in the world",
+ },
+ ]
+ );
+});
+
+/**
+ * Checks the image reported for an <img> that has both a `src`
+ * ("lazy-load.gif") and a `data-src` attribute: the expected image is the
+ * `data-src` value prefixed with BASE_URL.
+ */
+add_task(async function test_microdata_lazy_image() {
+ await verifyItems(
+ `
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <title>Product Info 1</title>
+ </head>
+ <body itemprop="badprop">
+ <div itemscope itemtype="https://schema.org/Product">
+ <img itemprop="image" src="lazy-load.gif" data-src="bon-echo-microwave-17in.jpg" />
+ <a href="microwave.html" itemprop="url">
+ <span itemprop="name">Bon Echo Microwave</span>
+ </a>
+ </div>
+ </body>
+ </html>
+ `,
+ [
+ {
+ "@type": "Product",
+ image: BASE_URL + "/bon-echo-microwave-17in.jpg",
+ url: BASE_URL + "/microwave.html",
+ name: "Bon Echo Microwave",
+ },
+ ]
+ );
+});
diff --git a/browser/components/pagedata/tests/unit/test_twitter.js b/browser/components/pagedata/tests/unit/test_twitter.js
new file mode 100644
index 0000000000..a49491f5c6
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/test_twitter.js
@@ -0,0 +1,34 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Basic tests for twitter cards.
+ */
+
+/**
+ * Checks the page data collected from Twitter card <meta> tags. The expected
+ * siteName, description and image equal the content of the twitter:site,
+ * twitter:description and twitter:image tags, and the data map is empty.
+ */
+add_task(async function test_twitter_card() {
+ await verifyPageData(
+ `
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta name="twitter:card" content="summary_large_image">
+ <meta name="twitter:site" content="@nytimes">
+ <meta name="twitter:creator" content="@SarahMaslinNir">
+ <meta name="twitter:title" content="Parade of Fans for Houston’s Funeral">
+ <meta name="twitter:description" content="NEWARK - The guest list and parade of limousines">
+ <meta name="twitter:image" content="http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg">
+ </head>
+ <body>
+ </body>
+ </html>
+ `,
+ {
+ siteName: "@nytimes",
+ description: "NEWARK - The guest list and parade of limousines",
+ image:
+ "http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg",
+ data: {},
+ }
+ );
+});
diff --git a/browser/components/pagedata/tests/unit/xpcshell.ini b/browser/components/pagedata/tests/unit/xpcshell.ini
new file mode 100644
index 0000000000..3104e61a86
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/xpcshell.ini
@@ -0,0 +1,14 @@
+# Manifest for the pagedata xpcshell unit tests; each section after [DEFAULT]
+# names one test file.
+[DEFAULT]
+firefox-appdir = browser
+skip-if = toolkit == 'android' # bug 1730213
+support-files =
+ head.js
+head = head.js
+
+[test_pagedata_basic.js]
+[test_pagedata_schema.js]
+[test_opengraph.js]
+[test_queue.js]
+[test_schemaorg.js]
+[test_schemaorg_parse.js]
+[test_twitter.js]