31 files changed, 3667 insertions, 0 deletions
diff --git a/browser/components/pagedata/.eslintrc.js b/browser/components/pagedata/.eslintrc.js
new file mode 100644
index 0000000000..8ead689bcc
--- /dev/null
+++ b/browser/components/pagedata/.eslintrc.js
@@ -0,0 +1,14 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+"use strict";
+
+module.exports = {
+  extends: ["plugin:mozilla/require-jsdoc"],
+  // Both rules below are reported as errors rather than warnings.
+  rules: {
+    "mozilla/var-only-at-top-level": "error",
+    "no-unused-expressions": "error",
+  },
+};
diff --git a/browser/components/pagedata/OpenGraphPageData.sys.mjs b/browser/components/pagedata/OpenGraphPageData.sys.mjs
new file mode 100644
index 0000000000..8f8b361799
--- /dev/null
+++ b/browser/components/pagedata/OpenGraphPageData.sys.mjs
@@ -0,0 +1,46 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Collects Open Graph (https://opengraphprotocol.org/) related data from a page.
+ */
+export const OpenGraphPageData = {
+  /**
+   * Collects the opengraph data from the page.
+   *
+   * @param {Document} document
+   *   The document to collect from
+   *
+   * @returns {PageData}
+   */
+  collect(document) {
+    let pageData = {};
+
+    // Sites can technically define an Open Graph prefix other than `og:`.
+    // However, `og:` is one of the default RDFa prefixes and it's likely
+    // uncommon that sites use a custom prefix. If we find that metadata is
+    // missing for common sites due to this issue, we could consider adding a
+    // basic RDFa parser.
+    let openGraphTags = document.querySelectorAll("meta[property^='og:'");
+
+    for (let tag of openGraphTags) {
+      // Strip "og:" from the property name.
+      let propertyName = tag.getAttribute("property").substring(3);
+
+      switch (propertyName) {
+        case "description":
+          pageData.description = tag.getAttribute("content");
+          break;
+        case "site_name":
+          pageData.siteName = tag.getAttribute("content");
+          break;
+        case "image":
+          pageData.image = tag.getAttribute("content");
+          break;
+      }
+    }
+
+    return pageData;
+  },
+};
diff --git a/browser/components/pagedata/PageDataChild.sys.mjs b/browser/components/pagedata/PageDataChild.sys.mjs
new file mode 100644
index 0000000000..51dc384526
--- /dev/null
+++ b/browser/components/pagedata/PageDataChild.sys.mjs
@@ -0,0 +1,121 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
+
+const lazy = {};
+
+ChromeUtils.defineESModuleGetters(lazy, {
+  PageDataSchema: "resource:///modules/pagedata/PageDataSchema.sys.mjs",
+  PrivateBrowsingUtils: "resource://gre/modules/PrivateBrowsingUtils.sys.mjs",
+});
+
+// We defer any attempt to check for page data for a short time after a page
+// loads to allow JS to operate.
+XPCOMUtils.defineLazyPreferenceGetter(
+  lazy,
+  "READY_DELAY",
+  "browser.pagedata.readyDelay",
+  500
+);
+
+/**
+ * The actor responsible for monitoring a page for page data.
+ */
+export class PageDataChild extends JSWindowActorChild {
+  #isContentWindowPrivate = true;
+  /**
+   * Used to debounce notifications about a page being ready.
+   *
+   * @type {Timer | null}
+   */
+  #deferTimer = null;
+
+  /**
+   * Called when the actor is created for a new page.
+   */
+  actorCreated() {
+    this.#isContentWindowPrivate =
+      lazy.PrivateBrowsingUtils.isContentWindowPrivate(this.contentWindow);
+  }
+
+  /**
+   * Called when the page is destroyed.
+   */
+  didDestroy() {
+    if (this.#deferTimer) {
+      this.#deferTimer.cancel();
+    }
+  }
+
+  /**
+   * Called when the page has signalled it is done loading. This signal is
+   * debounced by READY_DELAY.
+   */
+  #deferReady() {
+    if (!this.#deferTimer) {
+      this.#deferTimer = Cc["@mozilla.org/timer;1"].createInstance(Ci.nsITimer);
+    }
+
+    // If the timer was already running this re-starts it.
+    this.#deferTimer.initWithCallback(
+      () => {
+        this.#deferTimer = null;
+        this.sendAsyncMessage("PageData:DocumentReady", {
+          url: this.document.documentURI,
+        });
+      },
+      lazy.READY_DELAY,
+      Ci.nsITimer.TYPE_ONE_SHOT_LOW_PRIORITY
+    );
+  }
+
+  /**
+   * Called when a message is received from the parent process.
+   *
+   * @param {ReceiveMessageArgument} msg
+   *   The received message.
+   *
+   * @returns {Promise | undefined}
+   *   A promise for the requested data or undefined if no data was requested.
+   */
+  receiveMessage(msg) {
+    if (this.#isContentWindowPrivate) {
+      return undefined;
+    }
+
+    switch (msg.name) {
+      case "PageData:CheckLoaded":
+        // The service just started in the parent. Check if this document is
+        // already loaded.
+        if (this.document.readystate == "complete") {
+          this.#deferReady();
+        }
+        break;
+      case "PageData:Collect":
+        return lazy.PageDataSchema.collectPageData(this.document);
+    }
+
+    return undefined;
+  }
+
+  /**
+   * DOM event handler.
+   *
+   * @param {Event} event
+   *   The DOM event.
+   */
+  handleEvent(event) {
+    if (this.#isContentWindowPrivate) {
+      return;
+    }
+
+    switch (event.type) {
+      case "DOMContentLoaded":
+      case "pageshow":
+        this.#deferReady();
+        break;
+    }
+  }
+}
diff --git a/browser/components/pagedata/PageDataParent.sys.mjs b/browser/components/pagedata/PageDataParent.sys.mjs
new file mode 100644
index 0000000000..25295adeca
--- /dev/null
+++ b/browser/components/pagedata/PageDataParent.sys.mjs
@@ -0,0 +1,56 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+const lazy = {};
+
+ChromeUtils.defineESModuleGetters(lazy, {
+  PageDataService: "resource:///modules/pagedata/PageDataService.sys.mjs",
+});
+
+/**
+ * Receives messages from PageDataChild and passes them to the PageData service.
+ */
+export class PageDataParent extends JSWindowActorParent {
+  #deferredCollection = null;
+
+  /**
+   * Starts data collection in the child process. Returns a promise that
+   * resolves to the page data or null if the page is closed before data
+   * collection completes.
+   *
+   * @returns {Promise<PageData|null>}
+   */
+  collectPageData() {
+    if (!this.#deferredCollection) {
+      this.#deferredCollection = Promise.withResolvers();
+      this.sendQuery("PageData:Collect").then(
+        this.#deferredCollection.resolve,
+        this.#deferredCollection.reject
+      );
+    }
+
+    return this.#deferredCollection.promise;
+  }
+
+  /**
+   * Called when the page is destroyed.
+   */
+  didDestroy() {
+    this.#deferredCollection?.resolve(null);
+  }
+
+  /**
+   * Called when a message is received from the content process.
+   *
+   * @param {ReceiveMessageArgument} msg
+   *   The received message.
+   */
+  receiveMessage(msg) {
+    switch (msg.name) {
+      case "PageData:DocumentReady":
+        lazy.PageDataService.pageLoaded(this, msg.data.url);
+        break;
+    }
+  }
+}
diff --git a/browser/components/pagedata/PageDataSchema.sys.mjs b/browser/components/pagedata/PageDataSchema.sys.mjs
new file mode 100644
index 0000000000..ef3907325b
--- /dev/null
+++ b/browser/components/pagedata/PageDataSchema.sys.mjs
@@ -0,0 +1,249 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+const lazy = {};
+
+ChromeUtils.defineESModuleGetters(lazy, {
+  JsonSchemaValidator:
+    "resource://gre/modules/components-utils/JsonSchemaValidator.sys.mjs",
+  OpenGraphPageData: "resource:///modules/pagedata/OpenGraphPageData.sys.mjs",
+  SchemaOrgPageData: "resource:///modules/pagedata/SchemaOrgPageData.sys.mjs",
+  TwitterPageData: "resource:///modules/pagedata/TwitterPageData.sys.mjs",
+});
+
+ChromeUtils.defineLazyGetter(lazy, "logConsole", function () {
+  return console.createInstance({
+    prefix: "PageData",
+    maxLogLevel: Services.prefs.getBoolPref("browser.pagedata.log", false)
+      ? "Debug"
+      : "Warn",
+  });
+});
+
+/**
+ * The list of page data collectors. These should be sorted in order of
+ * specificity, if the same piece of data is provided by two collectors then the
+ * earlier wins.
+ *
+ * Collectors must provide a `collect` function which will be passed the
+ * document object and should return the PageData structure. The function may be
+ * asynchronous if needed.
+ *
+ * The data returned need not be valid, collectors should return whatever they
+ * can and then we drop anything that is invalid once all data is joined.
+ */
+ChromeUtils.defineLazyGetter(lazy, "DATA_COLLECTORS", function () {
+  return [lazy.SchemaOrgPageData, lazy.OpenGraphPageData, lazy.TwitterPageData];
+});
+
+let SCHEMAS = new Map();
+
+/**
+ * Loads the schema for the given name.
+ *
+ * @param {string} schemaName
+ *   The name of the schema to load.
+ */
+async function loadSchema(schemaName) {
+  if (SCHEMAS.has(schemaName)) {
+    return SCHEMAS.get(schemaName);
+  }
+
+  let url = `chrome://browser/content/pagedata/schemas/${schemaName.toLocaleLowerCase()}.schema.json`;
+  let response = await fetch(url);
+  if (!response.ok) {
+    throw new Error(`Failed to load schema: ${response.statusText}`);
+  }
+
+  let schema = await response.json();
+  SCHEMAS.set(schemaName, schema);
+  return schema;
+}
+
+/**
+ * Validates the data using the schema with the given name.
+ *
+ * @param {string} schemaName
+ *   The name of the schema to validate against.
+ * @param {object} data
+ *   The data to validate.
+ */
+async function validateData(schemaName, data) {
+  let schema = await loadSchema(schemaName.toLocaleLowerCase());
+
+  let result = lazy.JsonSchemaValidator.validate(data, schema, {
+    allowExplicitUndefinedProperties: true,
+    // Allowed for future expansion of the schema.
+    allowAdditionalProperties: true,
+  });
+
+  if (!result.valid) {
+    throw result.error;
+  }
+}
+
+/**
+ * A shared API that can be used in parent or child processes
+ */
+export const PageDataSchema = {
+  // Enumeration of data types. The keys must match the schema name.
+  DATA_TYPE: Object.freeze({
+    // Note that 1 and 2 were used as types in earlier versions and should not be used here.
+    PRODUCT: 3,
+    DOCUMENT: 4,
+    ARTICLE: 5,
+    AUDIO: 6,
+    VIDEO: 7,
+  }),
+
+  /**
+   * Gets the data type name.
+   *
+   * @param {DATA_TYPE} type
+   *   The data type from the DATA_TYPE enumeration
+   *
+   * @returns {string | null} The name for the type or null if not found.
+   */
+  nameForType(type) {
+    for (let [name, value] of Object.entries(this.DATA_TYPE)) {
+      if (value == type) {
+        return name;
+      }
+    }
+
+    return null;
+  },
+
+  /**
+   * Asynchronously validates some page data against the expected schema. Throws
+   * an exception if validation fails.
+   *
+   * @param {DATA_TYPE} type
+   *   The data type from the DATA_TYPE enumeration
+   * @param {object} data
+   *   The page data
+   */
+  async validateData(type, data) {
+    let name = this.nameForType(type);
+
+    if (!name) {
+      throw new Error(`Unknown data type ${type}`);
+    }
+
+    return validateData(name, data);
+  },
+
+  /**
+   * Asynchronously validates an entire PageData structure. Any invalid or
+   * unknown data types are dropped.
+   *
+   * @param {PageData} pageData
+   *   The page data
+   *
+   * @returns {PageData} The validated page data structure
+   */
+  async validatePageData(pageData) {
+    let { data: dataMap = {}, ...general } = pageData;
+
+    await validateData("general", general);
+
+    let validData = {};
+
+    for (let [type, data] of Object.entries(dataMap)) {
+      let name = this.nameForType(type);
+      // Ignore unknown types here.
+      if (!name) {
+        continue;
+      }
+
+      try {
+        await validateData(name, data);
+
+        validData[type] = data;
+      } catch (e) {
+        // Invalid data is dropped.
+      }
+    }
+
+    return {
+      ...general,
+      data: validData,
+    };
+  },
+
+  /**
+   * Adds new page data into an existing data set. Any existing data is not
+   * overwritten.
+   *
+   * @param {PageData} existingPageData
+   *   The existing page data
+   * @param {PageData} newPageData
+   *   The new page data
+   *
+   * @returns {PageData} The joined data.
+   */
+  coalescePageData(existingPageData, newPageData) {
+    // Split out the general data from the map of specific data.
+    let { data: existingMap = {}, ...existingGeneral } = existingPageData;
+    let { data: newMap = {}, ...newGeneral } = newPageData;
+
+    Object.assign(newGeneral, existingGeneral);
+
+    let dataMap = {};
+    for (let [type, data] of Object.entries(existingMap)) {
+      if (type in newMap) {
+        dataMap[type] = Object.assign({}, newMap[type], data);
+      } else {
+        dataMap[type] = data;
+      }
+    }
+
+    for (let [type, data] of Object.entries(newMap)) {
+      if (!(type in dataMap)) {
+        dataMap[type] = data;
+      }
+    }
+
+    return {
+      ...newGeneral,
+      data: dataMap,
+    };
+  },
+
+  /**
+   * Collects page data from a DOM document.
+   *
+   * @param {Document} document
+   *   The DOM document to collect data from
+   *
+   * @returns {Promise<PageData | null>} The data collected or null in case of
+   *   error.
+   */
+  async collectPageData(document) {
+    lazy.logConsole.debug("Starting collection", document.documentURI);
+
+    let pending = lazy.DATA_COLLECTORS.map(async collector => {
+      try {
+        return await collector.collect(document);
+      } catch (e) {
+        lazy.logConsole.error("Error collecting page data", e);
+        return null;
+      }
+    });
+
+    let pageDataList = await Promise.all(pending);
+
+    let pageData = pageDataList.reduce(PageDataSchema.coalescePageData, {
+      date: Date.now(),
+      url: document.documentURI,
+    });
+
+    try {
+      return this.validatePageData(pageData);
+    } catch (e) {
+      lazy.logConsole.error("Failed to collect valid page data", e);
+      return null;
+    }
+  },
+};
diff --git a/browser/components/pagedata/PageDataService.sys.mjs b/browser/components/pagedata/PageDataService.sys.mjs
new file mode 100644
index 0000000000..7160705c27
--- /dev/null
+++ b/browser/components/pagedata/PageDataService.sys.mjs
@@ -0,0 +1,677 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
+
+import { EventEmitter } from "resource://gre/modules/EventEmitter.sys.mjs";
+
+const lazy = {};
+
+ChromeUtils.defineESModuleGetters(lazy, {
+  BrowserWindowTracker: "resource:///modules/BrowserWindowTracker.sys.mjs",
+  E10SUtils: "resource://gre/modules/E10SUtils.sys.mjs",
+  HiddenFrame: "resource://gre/modules/HiddenFrame.sys.mjs",
+});
+
+ChromeUtils.defineLazyGetter(lazy, "logConsole", function () {
+  return console.createInstance({
+    prefix: "PageData",
+    maxLogLevel: Services.prefs.getBoolPref("browser.pagedata.log", false)
+      ? "Debug"
+      : "Warn",
+  });
+});
+
+XPCOMUtils.defineLazyServiceGetters(lazy, {
+  idleService: ["@mozilla.org/widget/useridleservice;1", "nsIUserIdleService"],
+});
+
+XPCOMUtils.defineLazyPreferenceGetter(
+  lazy,
+  "fetchIdleTime",
+  "browser.pagedata.fetchIdleTime",
+  300
+);
+
+const ALLOWED_SCHEMES = ["http", "https", "data", "blob"];
+
+const BACKGROUND_WIDTH = 1024;
+const BACKGROUND_HEIGHT = 768;
+
+/**
+ * Shifts the first element out of the set.
+ *
+ * @param {Set<T>} set
+ *   The set containing elements.
+ * @returns {T | undefined} The first element in the set or undefined if
+ *   there is nothing in the set.
+ */
+function shift(set) {
+  let iter = set.values();
+  let { value, done } = iter.next();
+
+  if (done) {
+    return undefined;
+  }
+
+  set.delete(value);
+  return value;
+}
+
+/**
+ * A manager for hidden browsers. Creates the hidden frame that holds them on
+ * first use and destroys it again once no hidden browsers remain in use.
+ */
+class HiddenBrowserManager {
+  /**
+   * The hidden frame if one has been created.
+   *
+   * @type {HiddenFrame | null}
+   */
+  #frame = null;
+  /**
+   * The number of hidden browser elements currently in use.
+   *
+   * @type {number}
+   */
+  #browsers = 0;
+
+  /**
+   * Creates a new remote hidden browser, creating the hidden frame if needed.
+   *
+   * @returns {Promise<Browser>} The browser, appended to the frame's document.
+   */
+  async #acquireBrowser() {
+    this.#browsers++;
+    if (!this.#frame) {
+      this.#frame = new lazy.HiddenFrame();
+    }
+    // NOTE(review): the count above is not undone if `get()` rejects - confirm.
+    let frame = await this.#frame.get();
+    let doc = frame.document;
+    let browser = doc.createXULElement("browser");
+    browser.setAttribute("remote", "true");
+    browser.setAttribute("type", "content");
+    browser.setAttribute(
+      "style",
+      `
+        width: ${BACKGROUND_WIDTH}px;
+        min-width: ${BACKGROUND_WIDTH}px;
+        height: ${BACKGROUND_HEIGHT}px;
+        min-height: ${BACKGROUND_HEIGHT}px;
+      `
+    );
+    browser.setAttribute("maychangeremoteness", "true");
+    doc.documentElement.appendChild(browser);
+
+    return browser;
+  }
+
+  /**
+   * Removes the given hidden browser from its document.
+   *
+   * @param {Browser} browser
+   *   The hidden browser element.
+   */
+  #releaseBrowser(browser) {
+    browser.remove();
+    // Releasing the last browser in use also destroys the hidden frame.
+    this.#browsers--;
+    if (this.#browsers == 0) {
+      this.#frame.destroy();
+      this.#frame = null;
+    }
+  }
+
+  /**
+   * Calls a callback function with a new hidden browser and releases the
+   * browser again once the callback has finished, whether or not it threw.
+   *
+   * @param {Callback} callback
+   *   The callback function will be called with the browser element and may
+   *   be asynchronous.
+   * @returns {Promise<T>} Whatever the callback function returns.
+   */
+  async withHiddenBrowser(callback) {
+    let browser = await this.#acquireBrowser();
+    try {
+      return await callback(browser);
+    } finally {
+      this.#releaseBrowser(browser);
+    }
+  }
+}
+
+/**
+ * @typedef {object} CacheEntry
+ *   An entry in the page data cache.
+ * @property {PageData | null} pageData
+ *   The data or null if there is no known data.
+ * @property {Set} actors
+ *   The actors that maintain an interest in keeping the entry cached.
+ */
+
+/**
+ * A cache of page data kept in memory. By default any discovered data from
+ * browsers is kept in memory until the browser element is destroyed but other
+ * actors may register an interest in keeping an entry alive beyond that.
+ */
+class PageDataCache {
+  /**
+   * The contents of the cache. Keyed on page url.
+   *
+   * @type {Map<string, CacheEntry>}
+   */
+  #cache = new Map();
+
+  /**
+   * Creates or updates an entry in the cache. If no actor has registered any
+   * interest in keeping this page's data in memory then this will do nothing.
+   *
+   * @param {string} url
+   *   The url of the page.
+   * @param {PageData|null} pageData
+   *   The current page data for the page.
+   */
+  set(url, pageData) {
+    let entry = this.#cache.get(url);
+
+    if (entry) {
+      entry.pageData = pageData;
+    }
+  }
+
+  /**
+   * Gets any cached data for the url.
+   *
+   * @param {string} url
+   *   The url of the page.
+   * @returns {PageData | null}
+   *   The page data if some is known.
+   */
+  get(url) {
+    let entry = this.#cache.get(url);
+    return entry?.pageData ?? null;
+  }
+
+  /**
+   * Adds a lock to an entry. This can be called before we have discovered the
+   * data for the url.
+   *
+   * @param {object} actor
+   *   Ensures the entry stays in memory until unlocked by this actor.
+   * @param {string} url
+   *   The url of the page.
+   */
+  lockData(actor, url) {
+    let entry = this.#cache.get(url);
+    if (entry) {
+      entry.actors.add(actor);
+    } else {
+      this.#cache.set(url, {
+        pageData: undefined,
+        actors: new Set([actor]),
+      });
+    }
+  }
+
+  /**
+   * Removes a lock from an entry.
+   *
+   * @param {object} actor
+   *   The lock to remove.
+   * @param {string | undefined} [url]
+   *   The url of the page or undefined to unlock all urls locked by this actor.
+   */
+  unlockData(actor, url) {
+    let entries = [];
+    if (url) {
+      let entry = this.#cache.get(url);
+      if (!entry) {
+        return;
+      }
+
+      entries.push([url, entry]);
+    } else {
+      entries = [...this.#cache];
+    }
+
+    for (let [entryUrl, entry] of entries) {
+      if (entry.actors.delete(actor)) {
+        if (entry.actors.size == 0) {
+          this.#cache.delete(entryUrl);
+        }
+      }
+    }
+  }
+}
+
+/**
+ * @typedef {object} PageData
+ *   A set of data discovered from a page. Other than the `data` property this
+ *   is the schema at `browser/components/pagedata/schemas/general.schema.json`.
+ * @property {string} url
+ *   The page's url.
+ * @property {number} date
+ *   The epoch based timestamp for when the data was discovered.
+ * @property {string} siteName
+ *   The page's friendly site name.
+ * @property {string} image
+ *   The page's image.
+ * @property {object} data
+ *   The map of data found which may be empty if no data was found. The key in
+ *   map is from the `PageDataSchema.DATA_TYPE` enumeration. The values are in
+ *   the format defined by the schemas at `browser/components/pagedata/schemas`.
+ */
+
+export const PageDataService = new (class PageDataService extends EventEmitter {
+  /**
+   * Caches page data discovered from browsers.
+   *
+   * @type {PageDataCache}
+   */
+  #pageDataCache = new PageDataCache();
+
+  /**
+   * The number of currently running background fetches.
+   *
+   * @type {number}
+   */
+  #backgroundFetches = 0;
+
+  /**
+   * The list of urls waiting to be loaded in the background.
+   *
+   * @type {Set<string>}
+   */
+  #backgroundQueue = new Set();
+
+  /**
+   * Tracks whether the user is currently idle.
+   *
+   * @type {boolean}
+   */
+  #userIsIdle = false;
+
+  /**
+   * A manager for hidden browsers.
+   *
+   * @type {HiddenBrowserManager}
+   */
+  #browserManager = new HiddenBrowserManager();
+
+  /**
+   * A map of hidden browsers to a resolve function that should be passed the
+   * actor that was created for the browser.
+   *
+   * @type {WeakMap<Browser, function(PageDataParent): void>}
+   */
+  #backgroundBrowsers = new WeakMap();
+
+  /**
+   * Tracks windows that have browsers with entries in the cache.
+   *
+   * @type {Map<Window, Set<Browser>>}
+   */
+  #trackedWindows = new Map();
+
+  /**
+   * Constructs the service.
+   */
+  constructor() {
+    super();
+
+    // Limits the number of background fetches that will run at once. Set to 0 to
+    // effectively allow an infinite number.
+    XPCOMUtils.defineLazyPreferenceGetter(
+      this,
+      "MAX_BACKGROUND_FETCHES",
+      "browser.pagedata.maxBackgroundFetches",
+      5,
+      () => this.#startBackgroundWorkers()
+    );
+  }
+
+  /**
+   * Initializes a new instance of the service, not called externally.
+   */
+  init() {
+    if (!Services.prefs.getBoolPref("browser.pagedata.enabled", false)) {
+      return;
+    }
+
+    ChromeUtils.registerWindowActor("PageData", {
+      parent: {
+        esModuleURI: "resource:///actors/PageDataParent.sys.mjs",
+      },
+      child: {
+        esModuleURI: "resource:///actors/PageDataChild.sys.mjs",
+        events: {
+          DOMContentLoaded: {},
+          pageshow: {},
+        },
+      },
+    });
+
+    lazy.logConsole.debug("Service started");
+
+    for (let win of lazy.BrowserWindowTracker.orderedWindows) {
+      if (!win.closed) {
+        // Ask any existing tabs to report
+        for (let tab of win.gBrowser.tabs) {
+          let parent =
+            tab.linkedBrowser.browsingContext?.currentWindowGlobal.getActor(
+              "PageData"
+            );
+
+          parent.sendAsyncMessage("PageData:CheckLoaded");
+        }
+      }
+    }
+
+    lazy.idleService.addIdleObserver(this, lazy.fetchIdleTime);
+  }
+
+  /**
+   * Called when the service is destroyed. This is generally on shutdown so we
+   * don't really need to do much cleanup.
+   */
+  uninit() {
+    lazy.logConsole.debug("Service stopped");
+  }
+
+  /**
+   * Starts tracking for when a browser is destroyed.
+   *
+   * @param {Browser} browser
+   *   The browser to track.
+   */
+  #trackBrowser(browser) {
+    let window = browser.ownerGlobal;
+
+    let browsers = this.#trackedWindows.get(window);
+    if (browsers) {
+      browsers.add(browser);
+
+      // This window is already being tracked, no need to add listeners.
+      return;
+    }
+
+    browsers = new Set([browser]);
+    this.#trackedWindows.set(window, browsers);
+
+    window.addEventListener("unload", () => {
+      for (let closedBrowser of browsers) {
+        this.unlockEntry(closedBrowser);
+      }
+
+      this.#trackedWindows.delete(window);
+    });
+
+    window.addEventListener("TabClose", ({ target: tab }) => {
+      // Unlock any entries locked by this browser.
+      let closedBrowser = tab.linkedBrowser;
+      this.unlockEntry(closedBrowser);
+      browsers.delete(closedBrowser);
+    });
+  }
+
+  /**
+   * Requests that any page data for this url is retained in memory until
+   * unlocked. By calling this you are committing to later call `unlockEntry`
+   * with the same `actor` and `url` parameters.
+   *
+   * @param {object} actor
+   *   The actor requesting the lock.
+   * @param {string} url
+   *   The url of the page to lock.
+   */
+  lockEntry(actor, url) {
+    this.#pageDataCache.lockData(actor, url);
+  }
+
+  /**
+   * Notifies that an actor is no longer interested in a url.
+   *
+   * @param {object} actor
+   *   The actor that requested the lock.
+   * @param {string | undefined} [url]
+   *   The url of the page or undefined to unlock all urls locked by this actor.
+   */
+  unlockEntry(actor, url) {
+    this.#pageDataCache.unlockData(actor, url);
+  }
+
+  /**
+   * Called when the content process signals that a page is ready for data
+   * collection.
+   *
+   * @param {PageDataParent} actor
+   *   The parent actor for the page.
+   * @param {string} url
+   *   The url of the page.
+   */
+  async pageLoaded(actor, url) {
+    let uri = Services.io.newURI(url);
+    if (!ALLOWED_SCHEMES.includes(uri.scheme)) {
+      return;
+    }
+
+    let browser = actor.browsingContext?.embedderElement;
+
+    // If we don't have a browser then it went away before we could record,
+    // so we don't know where the data came from.
+    if (!browser) {
+      return;
+    }
+
+    // Is this a load in a background browser?
+    let backgroundResolve = this.#backgroundBrowsers.get(browser);
+    if (backgroundResolve) {
+      backgroundResolve(actor);
+      return;
+    }
+
+    // Otherwise we only care about pages loaded in the tab browser.
+    if (!this.#isATabBrowser(browser)) {
+      return;
+    }
+
+    try {
+      let data = await actor.collectPageData();
+      if (data) {
+        // Keep this data alive until the browser is destroyed.
+        this.#trackBrowser(browser);
+        this.lockEntry(browser, data.url);
+
+        this.pageDataDiscovered(data);
+      }
+    } catch (e) {
+      lazy.logConsole.error(e);
+    }
+  }
+
+  /**
+   * Adds data for a url. This should generally only be called by other components of the
+   * page data service or tests for simulating page data collection.
+   *
+   * @param {PageData} pageData
+   *   The set of data discovered.
+   */
+  pageDataDiscovered(pageData) {
+    lazy.logConsole.debug("Discovered page data", pageData);
+
+    this.#pageDataCache.set(pageData.url, {
+      ...pageData,
+      data: pageData.data ?? {},
+    });
+
+    // Send out a notification.
+    this.emit("page-data", pageData);
+  }
+
+  /**
+   * Retrieves any cached page data. Returns null if there is no information in the cache, this will
+   * happen either if the page has not been browsed recently or if data collection failed for some
+   * reason.
+   *
+   * @param {string} url
+   *   The url to retrieve data for.
+   * @returns {PageData|null}
+   *   A `PageData` if one is cached (it may not actually contain any items of data) or null if this
+   *   page has not been successfully checked for data recently.
+   */
+  getCached(url) {
+    return this.#pageDataCache.get(url);
+  }
+
+  /**
+   * Fetches page data from the given URL using a hidden window. Note that this does not populate
+   * the page data cache or emit the `page-data` event.
+   *
+   * @param {string} url
+   *   The url to retrieve data for.
+   * @returns {Promise<PageData|null>}
+   *   Resolves to the found pagedata or null in case of error.
+   */
+  async fetchPageData(url) {
+    return this.#browserManager.withHiddenBrowser(async browser => {
+      try {
+        let { promise, resolve } = Promise.withResolvers();
+        this.#backgroundBrowsers.set(browser, resolve);
+
+        let principal = Services.scriptSecurityManager.getSystemPrincipal();
+        let oa = lazy.E10SUtils.predictOriginAttributes({
+          browser,
+        });
+        let loadURIOptions = {
+          triggeringPrincipal: principal,
+          remoteType: lazy.E10SUtils.getRemoteTypeForURI(
+            url,
+            true,
+            false,
+            lazy.E10SUtils.DEFAULT_REMOTE_TYPE,
+            null,
+            oa
+          ),
+        };
+        browser.fixupAndLoadURIString(url, loadURIOptions);
+
+        let actor = await promise;
+        return await actor.collectPageData();
+      } finally {
+        this.#backgroundBrowsers.delete(browser);
+      }
+    });
+  }
+
+  /**
+   * Handles notifications from the idle service.
+   *
+   * @param {nsISupports} subject
+   *   The notification's subject.
+   * @param {string} topic
+   *   The notification topic.
+   * @param {string} data
+   *   The data associated with the notification.
+   */
+  observe(subject, topic, data) {
+    switch (topic) {
+      case "idle":
+        lazy.logConsole.debug("User went idle");
+        this.#userIsIdle = true;
+        this.#startBackgroundWorkers();
+        break;
+      case "active":
+        lazy.logConsole.debug("User became active");
+        this.#userIsIdle = false;
+        break;
+    }
+  }
+
+  /**
+   * Starts as many background workers as are allowed to process the background
+   * queue.
+   */
+  #startBackgroundWorkers() {
+    if (!this.#userIsIdle) {
+      return;
+    }
+
+    let toStart;
+
+    if (this.MAX_BACKGROUND_FETCHES) {
+      toStart = this.MAX_BACKGROUND_FETCHES - this.#backgroundFetches;
+    } else {
+      toStart = this.#backgroundQueue.size;
+    }
+
+    for (let i = 0; i < toStart; i++) {
+      this.#backgroundFetch();
+    }
+  }
+
+  /**
+   * Starts a background fetch worker which will pull urls from the queue and
+   * load them until the queue is empty.
+   */
+  async #backgroundFetch() {
+    this.#backgroundFetches++;
+
+    let url = shift(this.#backgroundQueue);
+    while (url) {
+      try {
+        let pageData = await this.fetchPageData(url);
+
+        if (pageData) {
+          this.#pageDataCache.set(url, pageData);
+          this.emit("page-data", pageData);
+        }
+      } catch (e) {
+        lazy.logConsole.error(e);
+      }
+
+      // Check whether the user became active or the worker limit changed
+      // dynamically.
+      if (
+        !this.#userIsIdle ||
+        (this.MAX_BACKGROUND_FETCHES > 0 &&
+          this.#backgroundFetches > this.MAX_BACKGROUND_FETCHES)
+      ) {
+        break;
+      }
+
+      url = shift(this.#backgroundQueue);
+    }
+
+    this.#backgroundFetches--;
+  }
+
+  /**
+   * Queues page data retrieval for a url. The page-data notification will be
+   * generated if data becomes available.
+   *
+   * Check `getCached` first to ensure that data is not already in the cache.
+   *
+   * @param {string} url
+   *   The url to retrieve data for.
+   */
+  queueFetch(url) {
+    this.#backgroundQueue.add(url);
+
+    this.#startBackgroundWorkers();
+  }
+
+  /**
+   * Determines if the given browser is contained within a tab.
+   *
+   * @param {DOMElement} browser
+   *   The browser element to check.
+   * @returns {boolean}
+   *   True if the browser element is contained within a tab.
+   */
+  #isATabBrowser(browser) {
+    return browser.ownerGlobal.gBrowser?.getTabForBrowser(browser);
+  }
+})();
diff --git a/browser/components/pagedata/SchemaOrgPageData.sys.mjs b/browser/components/pagedata/SchemaOrgPageData.sys.mjs
new file mode 100644
index 0000000000..449572c76f
--- /dev/null
+++ b/browser/components/pagedata/SchemaOrgPageData.sys.mjs
@@ -0,0 +1,441 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+import { PageDataSchema } from "resource:///modules/pagedata/PageDataSchema.sys.mjs";
+
+/**
+ * A single item from the schema.org vocabulary.
+ *
+ * An `Item` is a type name plus a map of named properties. The spec is often
+ * unclear about whether a property holds one value or several, so every
+ * property is stored as a list of values. Use `get` when only the first value
+ * is wanted and `all` when every value is needed. A value may itself be an
+ * `Item` when items are nested.
+ */
+class Item {
+  /** @type {string} The type of the item e.g. "Product" or "Person". */
+  type;
+
+  /** @type {Map<string, any[]>} Maps each property name to its values. */
+  properties = new Map();
+
+  /**
+   * Constructs a new `Item` of the given type.
+   *
+   * @param {string} type
+   *   The type of the item.
+   */
+  constructor(type) {
+    this.type = type;
+  }
+
+  /**
+   * Checks whether this item has an entry for the named property.
+   *
+   * @param {string} prop
+   *   The name of the property.
+   * @returns {boolean}
+   *   True if the property is present on this item.
+   */
+  has(prop) {
+    return this.properties.has(prop);
+  }
+
+  /**
+   * Returns every value recorded for a property.
+   *
+   * @param {string} prop
+   *   The name of the property.
+   * @returns {any[]}
+   *   The property's values, or an empty array if the property is missing.
+   */
+  all(prop) {
+    let values = this.properties.get(prop);
+    return values ?? [];
+  }
+
+  /**
+   * Returns the first value recorded for a property.
+   *
+   * @param {string} prop
+   *   The name of the property.
+   * @returns {any}
+   *   The first value, or undefined if the property has no values.
+   */
+  get(prop) {
+    return this.all(prop)[0];
+  }
+
+  /**
+   * Appends a value to a property, creating the property if necessary.
+   *
+   * @param {string} prop
+   *   The name of the property.
+   * @param {any} value
+   *   The value to add.
+   */
+  set(prop, value) {
+    let existing = this.properties.get(prop);
+    if (existing !== undefined) {
+      existing.push(value);
+      return;
+    }
+
+    // First value seen for this property.
+    this.properties.set(prop, [value]);
+  }
+
+  /**
+   * Converts this item to JSON-LD.
+   *
+   * A property with exactly one value becomes a plain (non-array) property;
+   * any other property stays an array. Nested `Item` values are converted
+   * recursively.
+   *
+   * @returns {object}
+   *   An object holding "@type" and one key per property.
+   */
+  toJsonLD() {
+    /**
+     * Nested items are converted recursively; other values pass through.
+     *
+     * @param {any} val
+     * @returns {any}
+     */
+    const convert = val => (val instanceof Item ? val.toJsonLD() : val);
+
+    let entries = [];
+    for (let [name, values] of this.properties) {
+      let converted = values.map(convert);
+      // Unwrap single-valued properties.
+      entries.push([name, converted.length == 1 ? converted[0] : converted]);
+    }
+
+    return {
+      "@type": this.type,
+      ...Object.fromEntries(entries),
+    };
+  }
+}
+
+/**
+ * Parses the value for a given microdata property.
+ * See https://html.spec.whatwg.org/multipage/microdata.html#values for the parsing spec
+ *
+ * @param {Element} propElement
+ *   The property element. Must not be an item itself (no `itemscope`).
+ * @returns {any}
+ *   The value of the property.
+ * @throws {Error} If the element has an `itemscope` attribute.
+ */
+function parseMicrodataProp(propElement) {
+  if (propElement.hasAttribute("itemscope")) {
+    throw new Error(
+      "Cannot parse a simple property value from an itemscope element."
+    );
+  }
+
+  // Resolves a url attribute against the document; "" if missing or invalid.
+  const parseUrl = (urlElement, attr) => {
+    if (!urlElement.hasAttribute(attr)) {
+      return "";
+    }
+
+    try {
+      let base = urlElement.ownerDocument.documentURI;
+      return new URL(urlElement.getAttribute(attr), base).toString();
+    } catch (e) {
+      // Invalid urls are treated the same as a missing attribute.
+      return "";
+    }
+  };
+
+  switch (propElement.localName) {
+    case "meta":
+      return propElement.getAttribute("content") ?? "";
+    case "audio":
+    case "embed":
+    case "iframe":
+    case "source":
+    case "track":
+    case "video":
+      return parseUrl(propElement, "src");
+    case "img":
+      // Some pages may be using a lazy loading approach to images, putting a
+      // temporary image in "src" while the real image is in a differently
+      // named attribute. So far we found "content" and "data-src" are common
+      // names for that attribute.
+      return (
+        parseUrl(propElement, "content") ||
+        parseUrl(propElement, "data-src") ||
+        parseUrl(propElement, "src")
+      );
+    case "object":
+      return parseUrl(propElement, "data");
+    case "a":
+    case "area":
+    case "link":
+      return parseUrl(propElement, "href");
+    case "data":
+    case "meter":
+      return propElement.getAttribute("value");
+    case "time":
+      if (propElement.hasAttribute("datetime")) {
+        return propElement.getAttribute("datetime");
+      }
+      return propElement.textContent;
+    default:
+      // Not mentioned in the spec but sites seem to use it.
+      if (propElement.hasAttribute("content")) {
+        return propElement.getAttribute("content");
+      }
+      return propElement.textContent;
+  }
+}
+
+/**
+ * Collects product data from an item.
+ *
+ * Copies the item's image (resolved against the document url) and description
+ * onto the page data when present, then records a product entry holding the
+ * item's name and, if one of its offers has a numeric price, that price.
+ *
+ * @param {Document} document
+ *   The document the item comes from.
+ * @param {PageData} pageData
+ *   The pageData object to add to. Its `data` map must already exist.
+ * @param {Item} item
+ *   The product item.
+ */
+function collectProduct(document, pageData, item) {
+  if (item.has("image")) {
+    // Resolve relative image urls against the document.
+    let image = new URL(item.get("image"), document.documentURI);
+    pageData.image = image.toString();
+  }
+
+  if (item.has("description")) {
+    pageData.description = item.get("description");
+  }
+
+  let product = { name: item.get("name") };
+  pageData.data[PageDataSchema.DATA_TYPE.PRODUCT] = product;
+
+  // Only the first offer with a numeric price is used.
+  for (let candidate of item.all("offers")) {
+    let isOffer = candidate instanceof Item && candidate.type == "Offer";
+    let value = isOffer ? parseFloat(candidate.get("price")) : NaN;
+    if (isNaN(value)) {
+      continue;
+    }
+
+    product.price = { value, currency: candidate.get("priceCurrency") };
+    break;
+  }
+}
+
+/**
+ * Returns the root microdata items from the given document.
+ *
+ * Only items whose `itemtype` starts with the schema.org url are considered.
+ * Items used as the value of another item's property are nested, not roots.
+ *
+ * @param {Document} document
+ *   The DOM document to collect from.
+ * @returns {Item[]}
+ */
+function collectMicrodataItems(document) {
+  /**
+   * Maps elements to the closest item.
+   *
+   * @type {Map<Element, Item>}
+   */
+  let items = new Map();
+
+  // Create an empty item for every schema.org item element in the document.
+  let itemElements = document.querySelectorAll(
+    "[itemscope][itemtype^='https://schema.org/'], [itemscope][itemtype^='http://schema.org/']"
+  );
+  for (let itemElement of itemElements) {
+    let itemType = itemElement.getAttribute("itemtype");
+    // Strip off the base url: 19 characters for https, 18 for http.
+    let baseLength = itemType.startsWith("https://") ? 19 : 18;
+    items.set(itemElement, new Item(itemType.substring(baseLength)));
+  }
+
+  /**
+   * Finds the item for an element by walking up its ancestors. Throws if
+   * there is no item. Caches the result for every element walked over.
+   *
+   * @param {Element} element
+   *   The element to search from.
+   * @returns {Item}
+   */
+  function itemFor(element) {
+    let walked = [];
+    let current = element;
+    let found = items.get(current);
+    while (!found) {
+      if (!current.parentElement) {
+        throw new Error("Element has no parent item.");
+      }
+      walked.push(current);
+      current = current.parentElement;
+      found = items.get(current);
+    }
+
+    for (let passed of walked.reverse()) {
+      items.set(passed, found);
+    }
+    return found;
+  }
+
+  // The initial roots are just all the items.
+  let roots = new Set(items.values());
+
+  // Now attach every property to the item that contains it.
+  let propElements = document.querySelectorAll(
+    "[itemscope][itemtype^='https://schema.org/'] [itemprop], [itemscope][itemtype^='http://schema.org/'] [itemprop]"
+  );
+  for (let propElement of propElements) {
+    // The owning item is always defined above the property element.
+    let owner = itemFor(propElement.parentElement);
+
+    // The property's value is either a nested item or a simple value.
+    let value = items.get(propElement) ?? parseMicrodataProp(propElement);
+    owner.set(propElement.getAttribute("itemprop"), value);
+
+    if (value instanceof Item) {
+      // This item belongs to another item and so is not a root item.
+      roots.delete(value);
+    }
+  }
+
+  return [...roots];
+}
+
+/**
+ * Returns the root JSON-LD items from the given document.
+ *
+ * Scripts whose content cannot be parsed, is not a JSON object, or does not
+ * declare a schema.org `@context` are skipped.
+ *
+ * @param {Document} document
+ *   The DOM document to collect from.
+ * @returns {Item[]}
+ */
+function collectJsonLDItems(document) {
+  /**
+   * The root items.
+   *
+   * @type {Item[]}
+   */
+  let items = [];
+
+  /**
+   * Converts a JSON-LD value into an Item if appropriate.
+   *
+   * @param {any} val
+   *   The value to convert.
+   * @returns {any}
+   *   An `Item` for an object with an `@type`, otherwise `val` unchanged.
+   */
+  function fromLD(val) {
+    // `typeof null` is "object" and `in` throws on null, so rule it out first.
+    if (val === null || typeof val != "object" || !("@type" in val)) {
+      return val;
+    }
+
+    let item = new Item(val["@type"]);
+    for (let [prop, value] of Object.entries(val)) {
+      // Ignore meta properties.
+      if (prop.startsWith("@")) {
+        continue;
+      }
+
+      // Every property is stored as a list, so wrap single values.
+      let values = Array.isArray(value) ? value : [value];
+      item.properties.set(prop, values.map(fromLD));
+    }
+
+    return item;
+  }
+
+  let scripts = document.querySelectorAll(
+    "script[type='application/ld+json']"
+  );
+  for (let script of scripts) {
+    try {
+      let content = JSON.parse(script.textContent);
+
+      // Only a JSON object can carry a schema.org context.
+      if (content === null || typeof content != "object") {
+        continue;
+      }
+
+      let context = content["@context"];
+      if (context != "http://schema.org" && context != "https://schema.org") {
+        continue;
+      }
+
+      let item = fromLD(content);
+      if (item instanceof Item) {
+        items.push(item);
+      }
+    } catch (e) {
+      // Unparsable or otherwise malformed content; skip this script and
+      // continue with the next one.
+    }
+  }
+
+  return items;
+}
+
+/**
+ * Collects schema.org related data from a page.
+ *
+ * Currently only supports HTML Microdata and JSON-LD formats, not RDFa.
+ */
+export const SchemaOrgPageData = {
+  /**
+   * Parses and collects the schema.org items from the given document.
+   * The returned items are the roots, i.e. the top-level items, there may be
+   * other items as nested properties. Microdata items precede JSON-LD items.
+   *
+   * @param {Document} document
+   *   The DOM document to parse.
+   * @returns {Item[]}
+   */
+  collectItems(document) {
+    let microdataItems = collectMicrodataItems(document);
+    return [...microdataItems, ...collectJsonLDItems(document)];
+  },
+
+  /**
+   * Performs PageData collection from the given document.
+   *
+   * Product data is taken from the first `Product` root item only. The site
+   * name is taken from the name of an `Organization` root item; if there are
+   * several, the last one wins.
+   *
+   * @param {Document} document
+   *   The DOM document to collect from.
+   * @returns {PageData}
+   */
+  collect(document) {
+    let pageData = { data: {} };
+
+    for (let item of this.collectItems(document)) {
+      if (item.type === "Product") {
+        if (!(PageDataSchema.DATA_TYPE.PRODUCT in pageData.data)) {
+          collectProduct(document, pageData, item);
+        }
+      } else if (item.type === "Organization") {
+        pageData.siteName = item.get("name");
+      }
+    }
+
+    return pageData;
+  },
+};
diff --git a/browser/components/pagedata/TwitterPageData.sys.mjs b/browser/components/pagedata/TwitterPageData.sys.mjs
new file mode 100644
index 0000000000..88b06098cb
--- /dev/null
+++ b/browser/components/pagedata/TwitterPageData.sys.mjs
@@ -0,0 +1,42 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Collects Twitter card (https://developer.twitter.com/en/docs/twitter-for-websites/)
+ * related data from a page.
+ */
+export const TwitterPageData = {
+  /**
+   * Collects the twitter data from the page. Reads the `content` of the
+   * `twitter:site`, `twitter:description` and `twitter:image` meta tags.
+   *
+   * @param {Document} document
+   *   The document to collect from
+   * @returns {PageData}
+   */
+  collect(document) {
+    let pageData = {};
+
+    let twitterTags = document.querySelectorAll("meta[name^='twitter:']");
+
+    for (let tag of twitterTags) {
+      // Strip "twitter:" from the property name.
+      let propertyName = tag.getAttribute("name").substring(8);
+
+      switch (propertyName) {
+        case "site":
+          pageData.siteName = tag.getAttribute("content");
+          break;
+        case "description":
+          pageData.description = tag.getAttribute("content");
+          break;
+        case "image":
+          pageData.image = tag.getAttribute("content");
+          break;
+      }
+    }
+
+    return pageData;
+  },
+};
diff --git a/browser/components/pagedata/docs/index.md b/browser/components/pagedata/docs/index.md
new file mode 100644
index 0000000000..47b507d13a
--- /dev/null
+++ b/browser/components/pagedata/docs/index.md
@@ -0,0 +1,50 @@
+# PageDataService
+
+The page data service is responsible for collecting additional data about a page. This could include
+information about the media on a page, product information, etc. When enabled it will automatically
+try to find page data for pages that the user browses or it can be directed to asynchronously look
+up the page data for a url.
+
+The `PageDataService` is an EventEmitter and listeners can subscribe to its notifications via the
+`on` and `once` methods.
+
+The service can be enabled by setting `browser.pagedata.enabled` to true. Additional logging can be
+enabled by setting `browser.pagedata.log` to true.
+
+## PageData Data Structure
+
+At a high level the page data service can collect many different kinds of data. When queried the
+service will respond with a `PageData` structure which holds some general information about the
+page, the time when the data was discovered and a map of the different types of data found. This map
+will be empty if no specific data was found. The key of the map is from the
+`PageDataSchema.DATA_TYPE` enumeration. The value is the JSON data which differs in structure
+depending on the data type.
+
+```
+{
+  "url": <url of the page as a string>,
+  "date": <epoch based timestamp>,
+  "siteName": <a friendly name for the website>,
+  "image": <url for an image for the page as a string>,
+  "data": <map of data types>,
+}
+```
+
+## PageData Collection
+
+Page data is gathered in one of two ways.
+
+Page data is automatically gathered for webpages the user visits. This collection is triggered after
+a short delay and then updated when necessary. Any data is cached in memory for a period of time.
+When page data has been found a `page-data` event is emitted. The event's argument holds the
+`PageData` structure. The `getCached` function can be used to access any cached data for a url.
+
+## Supported Types of page data
+
+The following types of page data (`PageDataSchema.DATA_TYPE`) are currently supported:
+
+- `PRODUCT`
+- `DOCUMENT`
+- `ARTICLE`
+- `AUDIO`
+- `VIDEO`
diff --git a/browser/components/pagedata/jar.mn b/browser/components/pagedata/jar.mn
new file mode 100644
index 0000000000..19860a30ee
--- /dev/null
+++ b/browser/components/pagedata/jar.mn
@@ -0,0 +1,6 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+browser.jar:
+    content/browser/pagedata/schemas/ (schemas/*.json)
diff --git a/browser/components/pagedata/moz.build b/browser/components/pagedata/moz.build
new file mode 100644
index 0000000000..f1e49c4e4b
--- /dev/null
+++ b/browser/components/pagedata/moz.build
@@ -0,0 +1,29 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+XPCSHELL_TESTS_MANIFESTS += [
+    "tests/unit/xpcshell.toml",
+]
+BROWSER_CHROME_MANIFESTS += [
+    "tests/browser/browser.toml",
+]
+
+JAR_MANIFESTS += ["jar.mn"]
+
+EXTRA_JS_MODULES.pagedata += [
+    "OpenGraphPageData.sys.mjs",
+    "PageDataSchema.sys.mjs",
+    "PageDataService.sys.mjs",
+    "SchemaOrgPageData.sys.mjs",
+    "TwitterPageData.sys.mjs",
+]
+
+FINAL_TARGET_FILES.actors += [
+    "PageDataChild.sys.mjs",
+    "PageDataParent.sys.mjs",
+]
+
+SPHINX_TREES["docs"] = "docs"
diff --git a/browser/components/pagedata/schemas/article.schema.json b/browser/components/pagedata/schemas/article.schema.json
new file mode 100644
index 0000000000..e02bb11655
--- /dev/null
+++ b/browser/components/pagedata/schemas/article.schema.json
@@ -0,0 +1,26 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "article.schema.json",
+  "title": "Article",
+  "description": "An article for reading",
+  "type": "object",
+  "properties": {
+    "name": {
+      "description": "The article's name",
+      "type": "string"
+    },
+    "author": {
+      "description": "The author(s) of the article",
+      "type": "string"
+    },
+    "date": {
+      "description": "The date the article was published in ISO-8601 date or date/time format",
+      "type": "string"
+    },
+    "readingTime": {
+      "description": "The expected time to read the article in seconds",
+      "type": "number"
+    }
+  },
+  "required": ["name"]
+}
diff --git a/browser/components/pagedata/schemas/audio.schema.json b/browser/components/pagedata/schemas/audio.schema.json
new file mode 100644
index 0000000000..db1b79b55c
--- /dev/null
+++ b/browser/components/pagedata/schemas/audio.schema.json
@@ -0,0 +1,34 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "audio.schema.json",
+  "title": "Audio",
+  "description": "An audio file",
+  "type": "object",
+  "properties": {
+    "name": {
+      "description": "The audio's name",
+      "type": "string"
+    },
+    "duration": {
+      "description": "The audio's duration in seconds",
+      "type": "number"
+    },
+    "artist": {
+      "description": "The artist who created the audio",
+      "type": "string"
+    },
+    "album": {
+      "description": "For music on an album the name of the album",
+      "type": "string"
+    },
+    "track": {
+      "description": "For music on an album the number of the track on the album",
+      "type": "number"
+    },
+    "genre": {
+      "description": "The genre of the audio",
+      "type": "string"
+    }
+  },
+  "required": ["name"]
+}
diff --git a/browser/components/pagedata/schemas/document.schema.json b/browser/components/pagedata/schemas/document.schema.json
new file mode 100644
index 0000000000..849010773b
--- /dev/null
+++ b/browser/components/pagedata/schemas/document.schema.json
@@ -0,0 +1,18 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "document.schema.json",
+  "title": "Document",
+  "description": "A document of some kind, either viewable or editable",
+  "type": "object",
+  "properties": {
+    "name": {
+      "description": "The document's name",
+      "type": "string"
+    },
+    "mimeType": {
+      "description": "The document's mimetype",
+      "type": "string"
+    }
+  },
+  "required": ["name"]
+}
diff --git a/browser/components/pagedata/schemas/general.schema.json b/browser/components/pagedata/schemas/general.schema.json
new file mode 100644
index 0000000000..a400fd889b
--- /dev/null
+++ b/browser/components/pagedata/schemas/general.schema.json
@@ -0,0 +1,30 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "general.schema.json",
+  "title": "General",
+  "description": "General data about a page",
+  "type": "object",
+  "properties": {
+    "url": {
+      "description": "The page's url",
+      "type": "string"
+    },
+    "date": {
+      "description": "The date the data was collected as a timestamp",
+      "type": "number"
+    },
+    "description": {
+      "description": "A description of the page",
+      "type": "string"
+    },
+    "siteName": {
+      "description": "A friendly name for the site",
+      "type": "string"
+    },
+    "image": {
+      "description": "The url for an image representative of the page",
+      "type": "string"
+    }
+  },
+  "required": ["url", "date"]
+}
diff --git a/browser/components/pagedata/schemas/product.schema.json b/browser/components/pagedata/schemas/product.schema.json
new file mode 100644
index 0000000000..77bec76ff2
--- /dev/null
+++ b/browser/components/pagedata/schemas/product.schema.json
@@ -0,0 +1,46 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "product.schema.json",
+  "title": "Product",
+  "description": "A product that can be purchased",
+  "type": "object",
+  "properties": {
+    "name": {
+      "description": "The product's name",
+      "type": "string"
+    },
+    "brand": {
+      "description": "The product's brand",
+      "type": "string"
+    },
+    "price": {
+      "description": "The cost of a single unit",
+      "type": "object",
+      "properties": {
+        "value": {
+          "type": "number"
+        },
+        "currency": {
+          "description": "The currency for the value",
+          "type": "string"
+        }
+      },
+      "required": ["value"]
+    },
+    "shippingCost": {
+      "description": "The cost of shipping",
+      "type": "object",
+      "properties": {
+        "value": {
+          "type": "number"
+        },
+        "currency": {
+          "description": "The currency for the value",
+          "type": "string"
+        }
+      },
+      "required": ["value"]
+    }
+  },
+  "required": ["name"]
+}
diff --git a/browser/components/pagedata/schemas/video.schema.json b/browser/components/pagedata/schemas/video.schema.json
new file mode 100644
index 0000000000..1091ebfe89
--- /dev/null
+++ b/browser/components/pagedata/schemas/video.schema.json
@@ -0,0 +1,38 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "video.schema.json",
+  "title": "Video",
+  "description": "A video",
+  "type": "object",
+  "properties": {
+    "name": {
+      "description": "The video's name",
+      "type": "string"
+    },
+    "duration": {
+      "description": "The video's duration in seconds",
+      "type": "number"
+    },
+    "quality": {
+      "description": "A short description of the video's quality (e.g. 'HD', '720p')",
+      "type": "string"
+    },
+    "show": {
+      "description": "For an episode of a TV show the name of the TV show",
+      "type": "string"
+    },
+    "season": {
+      "description": "For an episode of a TV show the season number it appears in",
+      "type": "number"
+    },
+    "episode": {
+      "description": "For an episode of a TV show the number of the episode in the season",
+      "type": "number"
+    },
+    "genre": {
+      "description": "The genre of the video",
+      "type": "string"
+    }
+  },
+  "required": ["name"]
+}
diff --git a/browser/components/pagedata/tests/browser/browser.toml b/browser/components/pagedata/tests/browser/browser.toml
new file mode 100644
index 0000000000..8bcd7a539b
--- /dev/null
+++ b/browser/components/pagedata/tests/browser/browser.toml
@@ -0,0 +1,16 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+[DEFAULT]
+prefs = [
+  "browser.pagedata.log=true",
+  "browser.pagedata.enabled=true",
+]
+support-files = ["head.js"]
+
+["browser_pagedata_background.js"]
+
+["browser_pagedata_basic.js"]
+
+["browser_pagedata_cache.js"]
diff --git a/browser/components/pagedata/tests/browser/browser_pagedata_background.js b/browser/components/pagedata/tests/browser/browser_pagedata_background.js
new file mode 100644
index 0000000000..bba2ae2e47
--- /dev/null
+++ b/browser/components/pagedata/tests/browser/browser_pagedata_background.js
@@ -0,0 +1,48 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Background load tests for the page data service.
+ */
+
+const TEST_URL =
+  "data:text/html," +
+  encodeURIComponent(`
+    <html>
+    <head>
+      <meta name="twitter:card" content="summary_large_image">
+      <meta name="twitter:site" content="@nytimes">
+      <meta name="twitter:creator" content="@SarahMaslinNir">
+      <meta name="twitter:title" content="Parade of Fans for Houston’s Funeral">
+      <meta name="twitter:description" content="NEWARK - The guest list and parade of limousines">
+      <meta name="twitter:image" content="http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg">
+    </head>
+    <body>
+    </body>
+    </html>
+`);
+
+add_task(async function test_pagedata_background_fetch() {
+  let pageData = await PageDataService.fetchPageData(TEST_URL);
+  // The date is the time of collection, so drop it before comparing.
+  delete pageData.date;
+  Assert.deepEqual(
+    pageData,
+    {
+      url: TEST_URL,
+      siteName: "@nytimes",
+      description: "NEWARK - The guest list and parade of limousines",
+      image:
+        "http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg",
+      data: {},
+    },
+    "Should have returned the right data"
+  );
+  // A direct fetch hands the data to the caller without caching it.
+  Assert.equal(
+    PageDataService.getCached(TEST_URL),
+    null,
+    "Should not have cached this data"
+  );
+});
diff --git a/browser/components/pagedata/tests/browser/browser_pagedata_basic.js b/browser/components/pagedata/tests/browser/browser_pagedata_basic.js
new file mode 100644
index 0000000000..4984645274
--- /dev/null
+++ b/browser/components/pagedata/tests/browser/browser_pagedata_basic.js
@@ -0,0 +1,64 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Basic tests for the page data service.
+ */
+
+const TEST_URL = "https://example.com/";
+const TEST_URL2 = "https://example.com/browser";
+
+add_task(async function test_pagedata_no_data() {
+  let promise = PageDataService.once("page-data");
+  // The listener is registered before the tab opens and awaited inside it.
+  await BrowserTestUtils.withNewTab(TEST_URL, async browser => {
+    let pageData = await promise;
+    Assert.equal(pageData.url, TEST_URL, "Should have returned the loaded URL");
+    Assert.deepEqual(pageData.data, {}, "Should have returned no data");
+    Assert.deepEqual(
+      PageDataService.getCached(TEST_URL),
+      pageData,
+      "Should return the same data from the cache"
+    );
+    // Loading a second url in the same tab should notify again.
+    promise = PageDataService.once("page-data");
+    BrowserTestUtils.startLoadingURIString(browser, TEST_URL2);
+    await BrowserTestUtils.browserLoaded(browser, false, TEST_URL2);
+    pageData = await promise;
+    Assert.equal(
+      pageData.url,
+      TEST_URL2,
+      "Should have returned the loaded URL"
+    );
+    Assert.deepEqual(pageData.data, {}, "Should have returned no data");
+    Assert.deepEqual(
+      PageDataService.getCached(TEST_URL2),
+      pageData,
+      "Should return the same data from the cache"
+    );
+
+    info("Test going back still triggers collection");
+
+    promise = PageDataService.once("page-data");
+    let locationChangePromise = BrowserTestUtils.waitForLocationChange(
+      gBrowser,
+      TEST_URL
+    );
+    browser.goBack();
+    await locationChangePromise;
+    pageData = await promise;
+
+    Assert.equal(
+      pageData.url,
+      TEST_URL,
+      "Should have returned the URL of the previous page"
+    );
+    Assert.deepEqual(pageData.data, {}, "Should have returned no data");
+    Assert.deepEqual(
+      PageDataService.getCached(TEST_URL),
+      pageData,
+      "Should return the same data from the cache"
+    );
+  });
+});
diff --git a/browser/components/pagedata/tests/browser/browser_pagedata_cache.js b/browser/components/pagedata/tests/browser/browser_pagedata_cache.js
new file mode 100644
index 0000000000..e41b4ea2f8
--- /dev/null
+++ b/browser/components/pagedata/tests/browser/browser_pagedata_cache.js
@@ -0,0 +1,155 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Tests for the page data cache.
+ */
+
+const TEST_URL =
+  "data:text/html," +
+  encodeURIComponent(`
+    <!DOCTYPE html>
+    <html>
+    <head>
+      <meta charset="utf-8">
+      <meta name="twitter:card" content="summary_large_image">
+      <meta name="twitter:site" content="@nytimes">
+      <meta name="twitter:creator" content="@SarahMaslinNir">
+      <meta name="twitter:title" content="Parade of Fans for Houston’s Funeral">
+      <meta name="twitter:description" content="NEWARK - The guest list and parade of limousines">
+      <meta name="twitter:image" content="http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg">
+    </head>
+    <body>
+    </body>
+    </html>
+`);
+
+/**
+ * Loads a url in a tab of a new window, runs a task, then closes the window.
+ * @param {string} url The url to load.
+ * @param {Function} task The task to run with the browser. May be async.
+ */
+async function withBrowserInNewWindow(url, task) {
+  let win = await BrowserTestUtils.openNewBrowserWindow();
+  try {
+    let tab = await BrowserTestUtils.openNewForegroundTab(win.gBrowser, url);
+    await task(tab.linkedBrowser);
+  } finally {
+    await BrowserTestUtils.closeWindow(win);
+  }
+}
+
+add_task(async function test_pagedata_cache() {
+  let promise = PageDataService.once("page-data");
+
+  Assert.equal(
+    PageDataService.getCached(TEST_URL),
+    null,
+    "Should be no data cached."
+  );
+
+  await BrowserTestUtils.withNewTab(TEST_URL, async () => {
+    let pageData = await promise;
+
+    Assert.deepEqual(
+      PageDataService.getCached(TEST_URL),
+      pageData,
+      "Should return the same data from the cache"
+    );
+    // The date is the time of collection, so drop it before comparing.
+    delete pageData.date;
+
+    Assert.deepEqual(
+      pageData,
+      {
+        url: TEST_URL,
+        siteName: "@nytimes",
+        description: "NEWARK - The guest list and parade of limousines",
+        image:
+          "http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg",
+        data: {},
+      },
+      "Should have returned the right data"
+    );
+  });
+  // The tab is closed now, so nothing should keep the entry in the cache.
+  Assert.equal(
+    PageDataService.getCached(TEST_URL),
+    null,
+    "Data should no longer be cached."
+  );
+
+  promise = PageDataService.once("page-data");
+
+  // Checks that closing a window containing a tracked tab stops tracking the tab.
+  await withBrowserInNewWindow(TEST_URL, async () => {
+    let pageData = await promise;
+
+    Assert.deepEqual(
+      PageDataService.getCached(TEST_URL),
+      pageData,
+      "Should return the same data from the cache"
+    );
+
+    delete pageData.date;
+    Assert.deepEqual(
+      pageData,
+      {
+        url: TEST_URL,
+        siteName: "@nytimes",
+        description: "NEWARK - The guest list and parade of limousines",
+        image:
+          "http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg",
+        data: {},
+      },
+      "Should have returned the right data"
+    );
+  });
+
+  Assert.equal(
+    PageDataService.getCached(TEST_URL),
+    null,
+    "Data should no longer be cached."
+  );
+  // Take an extra lock on the entry, using a plain object as the owner.
+  let actor = {};
+  PageDataService.lockEntry(actor, TEST_URL);
+
+  promise = PageDataService.once("page-data");
+
+  // Closing a tracked tab shouldn't expire the data here as we have another lock.
+  await BrowserTestUtils.withNewTab(TEST_URL, async () => {
+    await promise;
+  });
+
+  promise = PageDataService.once("page-data");
+
+  // Closing a window with a tracked tab shouldn't expire the data here as we have another lock.
+  await withBrowserInNewWindow(TEST_URL, async () => {
+    await promise;
+  });
+
+  let cached = PageDataService.getCached(TEST_URL);
+  delete cached.date;
+  Assert.deepEqual(
+    cached,
+    {
+      url: TEST_URL,
+      siteName: "@nytimes",
+      description: "NEWARK - The guest list and parade of limousines",
+      image:
+        "http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg",
+      data: {},
+    },
+    "Entry should still be cached"
+  );
+  // Releasing the extra lock should drop the entry from the cache.
+  PageDataService.unlockEntry(actor, TEST_URL);
+
+  Assert.equal(
+    PageDataService.getCached(TEST_URL),
+    null,
+    "Data should no longer be cached."
+  );
+});
diff --git a/browser/components/pagedata/tests/browser/head.js b/browser/components/pagedata/tests/browser/head.js
new file mode 100644
index 0000000000..b4f57cdb76
--- /dev/null
+++ b/browser/components/pagedata/tests/browser/head.js
@@ -0,0 +1,8 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+ChromeUtils.defineESModuleGetters(this, {
+  PageDataSchema: "resource:///modules/pagedata/PageDataSchema.sys.mjs",
+  PageDataService: "resource:///modules/pagedata/PageDataService.sys.mjs",
+});
diff --git a/browser/components/pagedata/tests/unit/head.js b/browser/components/pagedata/tests/unit/head.js
new file mode 100644
index 0000000000..55b002692b
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/head.js
@@ -0,0 +1,105 @@
+/* Any copyright is dedicated to the Public Domain.
+ * http://creativecommons.org/publicdomain/zero/1.0/ */
+
+const { XPCOMUtils } = ChromeUtils.importESModule(
+  "resource://gre/modules/XPCOMUtils.sys.mjs"
+);
+
+ChromeUtils.defineESModuleGetters(this, {
+  PageDataSchema: "resource:///modules/pagedata/PageDataSchema.sys.mjs",
+});
+
+const { HttpServer } = ChromeUtils.importESModule(
+  "resource://testing-common/httpd.sys.mjs"
+);
+
+const server = new HttpServer();
+server.start(-1);
+
+const SERVER_PORT = server.identity.primaryPort;
+const BASE_URL = "http://localhost:" + SERVER_PORT;
+const DEFAULT_PATH = "/document.html";
+const TEST_URL = BASE_URL + DEFAULT_PATH;
+
+registerCleanupFunction(() => {
+  server.stop();
+});
+
+do_get_profile();
+Services.prefs.setBoolPref("browser.pagedata.log", true);
+
+/**
+ * Serves a string over the test HTTP server and loads it as a DOM Document.
+ *
+ * @param {string} str
+ *   The HTML source; it is written to the response body as UTF-8.
+ * @param {string} [path]
+ *   The server path to register and request, defaults to "/document.html"
+ * @returns {Promise<Document>} The loaded document; rejects on error or abort.
+ */
+function parseDocument(str, path = DEFAULT_PATH) {
+  server.registerPathHandler(path, (request, response) => {
+    response.setHeader("Content-Type", "text/html;charset=utf-8");
+
+    let converter = Cc[
+      "@mozilla.org/intl/converter-output-stream;1"
+    ].createInstance(Ci.nsIConverterOutputStream);
+    converter.init(response.bodyOutputStream, "utf-8");
+    converter.writeString(str);
+  });
+
+  return new Promise((resolve, reject) => {
+    let request = new XMLHttpRequest();
+    request.responseType = "document"; // Have XHR parse the response as a document.
+    request.open("GET", BASE_URL + path, true);
+
+    request.addEventListener("error", reject);
+    request.addEventListener("abort", reject);
+
+    request.addEventListener("load", function () {
+      resolve(request.responseXML);
+    });
+
+    request.send();
+  });
+}
+
+/**
+ * Loads an HTML string with parseDocument and collects page data from it.
+ *
+ * @param {string} str
+ *   The HTML string to parse.
+ * @param {string} [path]
+ *   The path for the document on the server, defaults to "/document.html"
+ * @returns {Promise<PageData>} The result of PageDataSchema.collectPageData.
+ */
+async function parsePageData(str, path) {
+  let doc = await parseDocument(str, path);
+  return PageDataSchema.collectPageData(doc);
+}
+
+/**
+ * Verifies that the HTML string given parses to the expected page data.
+ *
+ * @param {string} str
+ *   The HTML string to parse.
+ * @param {PageData} expected
+ *   The expected pagedata excluding the date and url properties.
+ * @param {string} [path]
+ *   The path for the document on the server, defaults to "/document.html"
+ * @returns {Promise<void>} Resolves once the assertions have run.
+ */
+async function verifyPageData(str, expected, path = DEFAULT_PATH) {
+  let pageData = await parsePageData(str, path);
+
+  delete pageData.date; // The date is excluded from the comparison.
+
+  Assert.equal(pageData.url, BASE_URL + path);
+  delete pageData.url;
+
+  Assert.deepEqual(
+    pageData,
+    expected,
+    "Should have seen the expected page data."
+  );
+}
diff --git a/browser/components/pagedata/tests/unit/test_opengraph.js b/browser/components/pagedata/tests/unit/test_opengraph.js
new file mode 100644
index 0000000000..e5accaf675
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/test_opengraph.js
@@ -0,0 +1,67 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Tests that the page data service can parse Open Graph metadata.
+ */
+
+add_task(async function test_type_website() {
+  await verifyPageData(
+    `
+      <!DOCTYPE html>
+      <html>
+      <head>
+        <title>Internet for people, not profit — Mozilla</title>
+        <meta property="og:type" content="website">
+        <meta property="og:site_name" content="Mozilla">
+        <meta property="og:url" content="https://www.mozilla.org/">
+        <meta property="og:image" content="https://example.com/preview-image">
+        <meta property="og:title" content="Internet for people, not profit">
+        <!-- We expect the test will ignore tags the parser does not recognize. -->
+        <meta property="og:locale" content="en_CA">
+        <meta property="og:description" content="Mozilla is the not-for-profit behind the lightning fast Firefox browser. We put people over profit to give everyone more power online.">
+      </head>
+      <body>
+        <p>Test page</p>
+      </body>
+      </html>
+    `,
+    {
+      siteName: "Mozilla",
+      description:
+        "Mozilla is the not-for-profit behind the lightning fast Firefox browser. We put people over profit to give everyone more power online.",
+      image: "https://example.com/preview-image",
+      data: {},
+    }
+  );
+});
+
+add_task(async function test_type_movie() {
+  await verifyPageData(
+    `
+      <!DOCTYPE html>
+      <html>
+      <head>
+        <title>Code Rush (TV Movie 2000)</title>
+        <meta property="og:url" content="https://www.imdb.com/title/tt0499004/"/>
+        <!-- Omitting og:site_name to test that the parser doesn't break on missing tags. -->
+        <meta property="og:title" content="Code Rush (TV Movie 2000) - IMDb"/>
+        <meta property="og:description" content="This is the description of the movie."/>
+        <meta property="og:type" content="video.movie"/>
+        <meta property="og:image" content="https://example.com/preview-code-rush"/>
+        <meta property="og:image:height" content="750"/>
+        <meta property="og:image:width" content="1000"/>
+      </head>
+      <body>
+        <p>Test page</p>
+      </body>
+      </html>
+    `,
+    {
+      image: "https://example.com/preview-code-rush",
+      description: "This is the description of the movie.",
+      data: {},
+    }
+  );
+});
diff --git a/browser/components/pagedata/tests/unit/test_pagedata_basic.js b/browser/components/pagedata/tests/unit/test_pagedata_basic.js
new file mode 100644
index 0000000000..5d31645a4c
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/test_pagedata_basic.js
@@ -0,0 +1,100 @@
+/* Any copyright is dedicated to the Public Domain.
+ * http://creativecommons.org/publicdomain/zero/1.0/ */
+
+/*
+ * Tests that a notification is dispatched when new page data is discovered
+ * and that the data is only cached while an entry lock is held.
+ */
+
+ChromeUtils.defineESModuleGetters(this, {
+  PageDataService: "resource:///modules/pagedata/PageDataService.sys.mjs",
+});
+
+add_task(async function test_pageDataDiscovered_notifies() {
+  let url = "https://www.mozilla.org/";
+
+  Assert.equal(
+    PageDataService.getCached(url),
+    null,
+    "Should be no cached data."
+  );
+
+  let promise = PageDataService.once("page-data");
+
+  PageDataService.pageDataDiscovered({
+    url,
+    date: 32453456,
+    data: {
+      [PageDataSchema.DATA_TYPE.PRODUCT]: {
+        name: "Bolts",
+        price: { value: 276 },
+      },
+    },
+  });
+
+  let pageData = await promise;
+  Assert.equal(
+    pageData.url,
+    url,
+    "Should have notified data for the expected url"
+  );
+
+  Assert.deepEqual(
+    pageData,
+    {
+      url,
+      date: 32453456,
+      data: {
+        [PageDataSchema.DATA_TYPE.PRODUCT]: {
+          name: "Bolts",
+          price: { value: 276 },
+        },
+      },
+    },
+    "Should have returned the correct product data"
+  );
+
+  Assert.equal(
+    PageDataService.getCached(url),
+    null,
+    "Should not have cached the data as there was no actor locking."
+  );
+
+  let actor = {};
+  PageDataService.lockEntry(actor, url);
+
+  PageDataService.pageDataDiscovered({
+    url,
+    date: 32453456,
+    data: {
+      [PageDataSchema.DATA_TYPE.PRODUCT]: {
+        name: "Bolts",
+        price: { value: 276 },
+      },
+    },
+  });
+
+  // Should now be in the cache.
+  Assert.deepEqual(
+    PageDataService.getCached(url),
+    {
+      url,
+      date: 32453456,
+      data: {
+        [PageDataSchema.DATA_TYPE.PRODUCT]: {
+          name: "Bolts",
+          price: { value: 276 },
+        },
+      },
+    },
+    "Should have cached the data"
+  );
+
+  PageDataService.unlockEntry(actor, url);
+
+  Assert.equal(
+    PageDataService.getCached(url),
+    null,
+    "Should have dropped the data from the cache."
+  );
+});
diff --git a/browser/components/pagedata/tests/unit/test_pagedata_schema.js b/browser/components/pagedata/tests/unit/test_pagedata_schema.js
new file mode 100644
index 0000000000..fcd9c4b297
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/test_pagedata_schema.js
@@ -0,0 +1,210 @@
+/* Any copyright is dedicated to the Public Domain.
+ * http://creativecommons.org/publicdomain/zero/1.0/ */
+
+/*
+ * Tests schema type names, data validation and page data coalescing.
+ */
+
+add_task(async function testBasic() {
+  // Old data types, should not be recognised.
+  Assert.equal(PageDataSchema.nameForType(1), null);
+  Assert.equal(PageDataSchema.nameForType(2), null);
+
+  Assert.equal(
+    PageDataSchema.nameForType(PageDataSchema.DATA_TYPE.VIDEO),
+    "VIDEO"
+  );
+  Assert.equal(
+    PageDataSchema.nameForType(PageDataSchema.DATA_TYPE.PRODUCT),
+    "PRODUCT"
+  );
+});
+
+add_task(async function testProduct() {
+  // Products must have a name
+  await Assert.rejects(
+    PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, {}),
+    /missing required property 'name'/
+  );
+
+  await PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, {
+    name: "Bolts",
+  });
+
+  await PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, {
+    name: "Bolts",
+    price: {
+      value: 5,
+    },
+  });
+
+  await PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, {
+    name: "Bolts",
+    price: {
+      value: 5,
+      currency: "USD",
+    },
+  });
+
+  await Assert.rejects(
+    PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, {
+      name: "Bolts",
+      price: {
+        currency: "USD",
+      },
+    }),
+    /missing required property 'value'/
+  );
+
+  await PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, {
+    name: "Bolts",
+    shippingCost: {
+      value: 5,
+      currency: "USD",
+    },
+  });
+
+  await Assert.rejects(
+    PageDataSchema.validateData(PageDataSchema.DATA_TYPE.PRODUCT, {
+      name: "Bolts",
+      shippingCost: {
+        currency: "USD",
+      },
+    }),
+    /missing required property 'value'/
+  );
+});
+
+add_task(async function testCoalesce() {
+  let joined = PageDataSchema.coalescePageData({}, {});
+  Assert.deepEqual(joined, { data: {} });
+
+  joined = PageDataSchema.coalescePageData(
+    {
+      url: "https://www.google.com/",
+      data: {
+        [PageDataSchema.DATA_TYPE.PRODUCT]: {
+          name: "bolts",
+        },
+        [PageDataSchema.DATA_TYPE.VIDEO]: {
+          name: "My video",
+          duration: 500,
+        },
+      },
+    },
+    {
+      url: "https://www.mozilla.com/",
+      date: 27,
+      siteName: "Mozilla",
+      data: {
+        [PageDataSchema.DATA_TYPE.PRODUCT]: {
+          name: "newname",
+          price: {
+            value: 55,
+          },
+        },
+        [PageDataSchema.DATA_TYPE.AUDIO]: {
+          name: "My song",
+        },
+      },
+    }
+  );
+
+  Assert.deepEqual(joined, {
+    url: "https://www.google.com/",
+    date: 27,
+    siteName: "Mozilla",
+    data: {
+      [PageDataSchema.DATA_TYPE.PRODUCT]: {
+        name: "bolts",
+        price: {
+          value: 55,
+        },
+      },
+      [PageDataSchema.DATA_TYPE.VIDEO]: {
+        name: "My video",
+        duration: 500,
+      },
+      [PageDataSchema.DATA_TYPE.AUDIO]: {
+        name: "My song",
+      },
+    },
+  });
+});
+
+add_task(async function testPageData() {
+  // Full page data needs a url and a date
+  await Assert.rejects(
+    PageDataSchema.validatePageData({}),
+    /missing required property 'url'/
+  );
+
+  await Assert.rejects(
+    PageDataSchema.validatePageData({ url: "https://www.google.com" }),
+    /missing required property 'date'/
+  );
+
+  await Assert.rejects(
+    PageDataSchema.validatePageData({ date: 55 }),
+    /missing required property 'url'/
+  );
+
+  Assert.deepEqual(
+    await PageDataSchema.validatePageData({
+      url: "https://www.google.com",
+      date: 55,
+    }),
+    { url: "https://www.google.com", date: 55, data: {} }
+  );
+
+  Assert.deepEqual(
+    await PageDataSchema.validatePageData({
+      url: "https://www.google.com",
+      date: 55,
+      data: {
+        0: {
+          name: "unknown",
+        },
+        [PageDataSchema.DATA_TYPE.PRODUCT]: {
+          name: "Bolts",
+          price: {
+            value: 55,
+          },
+        },
+      },
+    }),
+    {
+      url: "https://www.google.com",
+      date: 55,
+      data: {
+        [PageDataSchema.DATA_TYPE.PRODUCT]: {
+          name: "Bolts",
+          price: {
+            value: 55,
+          },
+        },
+      },
+    }
+  );
+
+  // Should drop invalid inner data.
+  Assert.deepEqual(
+    await PageDataSchema.validatePageData({
+      url: "https://www.google.com",
+      date: 55,
+      data: {
+        [PageDataSchema.DATA_TYPE.PRODUCT]: {
+          name: "Bolts",
+          price: {
+            currency: "USD",
+          },
+        },
+      },
+    }),
+    {
+      url: "https://www.google.com",
+      date: 55,
+      data: {},
+    }
+  );
+});
diff --git a/browser/components/pagedata/tests/unit/test_queue.js b/browser/components/pagedata/tests/unit/test_queue.js
new file mode 100644
index 0000000000..d683c9a601
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/test_queue.js
@@ -0,0 +1,512 @@
+/* Any copyright is dedicated to the Public Domain.
+ * http://creativecommons.org/publicdomain/zero/1.0/ */
+
+ChromeUtils.defineESModuleGetters(this, {
+  PageDataService: "resource:///modules/pagedata/PageDataService.sys.mjs",
+  TestUtils: "resource://testing-common/TestUtils.sys.mjs",
+});
+
+// Test that urls are retrieved in the expected order.
+add_task(async function test_queueOrder() {
+  Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 0);
+  // Pretend we are idle.
+  PageDataService.observe(null, "idle", null);
+
+  let pageDataResults = [
+    {
+      date: Date.now(),
+      url: "http://www.mozilla.org/1",
+      siteName: "Mozilla",
+      data: {},
+    },
+    {
+      date: Date.now() - 3600,
+      url: "http://www.google.com/2",
+      siteName: "Google",
+      data: {},
+    },
+    {
+      date: Date.now() + 3600,
+      url: "http://www.example.com/3",
+      image: "http://www.example.com/banner.jpg",
+      data: {},
+    },
+    {
+      date: Date.now() / 2,
+      url: "http://www.wikipedia.org/4",
+      data: {},
+    },
+    {
+      date: Date.now() / 3,
+      url: "http://www.microsoft.com/5",
+      data: {
+        [PageDataSchema.DATA_TYPE.PRODUCT]: {
+          name: "Windows 11",
+        },
+      },
+    },
+  ];
+
+  let requests = [];
+  PageDataService.fetchPageData = url => {
+    requests.push(url);
+
+    for (let pageData of pageDataResults) {
+      if (pageData.url == url) {
+        return Promise.resolve(pageData);
+      }
+    }
+
+    return Promise.reject(new Error("Unknown url"));
+  };
+
+  let { promise: completePromise, resolve } = Promise.withResolvers();
+
+  let results = [];
+  let listener = (_, pageData) => {
+    results.push(pageData);
+    if (results.length == pageDataResults.length) {
+      resolve();
+    }
+  };
+
+  PageDataService.on("page-data", listener);
+
+  for (let pageData of pageDataResults) {
+    PageDataService.queueFetch(pageData.url);
+  }
+
+  await completePromise;
+  PageDataService.off("page-data", listener);
+
+  Assert.deepEqual(
+    requests,
+    pageDataResults.map(pd => pd.url)
+  );
+
+  // Because our fetch implementation is essentially synchronous the results
+  // will be in a known order. This isn't guaranteed by the API though.
+  Assert.deepEqual(results, pageDataResults);
+
+  delete PageDataService.fetchPageData;
+});
+
+// Tests that limiting the number of fetches works.
+add_task(async function test_queueLimit() {
+  Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 3);
+  // Pretend we are idle.
+  PageDataService.observe(null, "idle", null);
+
+  let requests = [];
+  PageDataService.fetchPageData = url => {
+    let { promise, resolve, reject } = Promise.withResolvers();
+    requests.push({ url, resolve, reject });
+
+    return promise;
+  };
+
+  let results = [];
+  let listener = (_, pageData) => {
+    results.push(pageData?.url);
+  };
+
+  PageDataService.on("page-data", listener);
+
+  PageDataService.queueFetch("https://www.mozilla.org/1");
+  PageDataService.queueFetch("https://www.mozilla.org/2");
+  PageDataService.queueFetch("https://www.mozilla.org/3");
+  PageDataService.queueFetch("https://www.mozilla.org/4");
+  PageDataService.queueFetch("https://www.mozilla.org/5");
+  PageDataService.queueFetch("https://www.mozilla.org/6");
+  PageDataService.queueFetch("https://www.mozilla.org/7");
+  PageDataService.queueFetch("https://www.mozilla.org/8");
+  PageDataService.queueFetch("https://www.mozilla.org/9");
+  PageDataService.queueFetch("https://www.mozilla.org/10");
+  PageDataService.queueFetch("https://www.mozilla.org/11");
+
+  await TestUtils.waitForTick();
+
+  Assert.deepEqual(
+    requests.map(r => r.url),
+    [
+      "https://www.mozilla.org/1",
+      "https://www.mozilla.org/2",
+      "https://www.mozilla.org/3",
+    ]
+  );
+
+  // Completing or rejecting a request should start new ones.
+
+  requests[1].resolve({
+    date: 2345,
+    url: "https://www.mozilla.org/2",
+    siteName: "Test 2",
+    data: {},
+  });
+
+  await TestUtils.waitForTick();
+
+  Assert.deepEqual(
+    requests.map(r => r.url),
+    [
+      "https://www.mozilla.org/1",
+      "https://www.mozilla.org/2",
+      "https://www.mozilla.org/3",
+      "https://www.mozilla.org/4",
+    ]
+  );
+
+  requests[3].reject(new Error("Fail"));
+
+  await TestUtils.waitForTick();
+
+  Assert.deepEqual(
+    requests.map(r => r.url),
+    [
+      "https://www.mozilla.org/1",
+      "https://www.mozilla.org/2",
+      "https://www.mozilla.org/3",
+      "https://www.mozilla.org/4",
+      "https://www.mozilla.org/5",
+    ]
+  );
+
+  // Increasing the limit should start more requests.
+  Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 5);
+
+  await TestUtils.waitForTick();
+
+  Assert.deepEqual(
+    requests.map(r => r.url),
+    [
+      "https://www.mozilla.org/1",
+      "https://www.mozilla.org/2",
+      "https://www.mozilla.org/3",
+      "https://www.mozilla.org/4",
+      "https://www.mozilla.org/5",
+      "https://www.mozilla.org/6",
+      "https://www.mozilla.org/7",
+    ]
+  );
+
+  // Dropping the limit shouldn't start anything new.
+  Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 3);
+
+  await TestUtils.waitForTick();
+
+  Assert.deepEqual(
+    requests.map(r => r.url),
+    [
+      "https://www.mozilla.org/1",
+      "https://www.mozilla.org/2",
+      "https://www.mozilla.org/3",
+      "https://www.mozilla.org/4",
+      "https://www.mozilla.org/5",
+      "https://www.mozilla.org/6",
+      "https://www.mozilla.org/7",
+    ]
+  );
+
+  // Completing requests shouldn't start new ones while at or above the limit.
+  requests[5].resolve({
+    date: 345334,
+    url: "https://www.mozilla.org/6",
+    siteName: "Test 6",
+    data: {},
+  });
+
+  requests[0].resolve({
+    date: 343446434,
+    url: "https://www.mozilla.org/1",
+    siteName: "Test 1",
+    data: {},
+  });
+
+  await TestUtils.waitForTick();
+
+  Assert.deepEqual(
+    requests.map(r => r.url),
+    [
+      "https://www.mozilla.org/1",
+      "https://www.mozilla.org/2",
+      "https://www.mozilla.org/3",
+      "https://www.mozilla.org/4",
+      "https://www.mozilla.org/5",
+      "https://www.mozilla.org/6",
+      "https://www.mozilla.org/7",
+    ]
+  );
+
+  // Until enough requests complete to drop below the limit.
+  requests[4].resolve(null);
+
+  await TestUtils.waitForTick();
+
+  Assert.deepEqual(
+    requests.map(r => r.url),
+    [
+      "https://www.mozilla.org/1",
+      "https://www.mozilla.org/2",
+      "https://www.mozilla.org/3",
+      "https://www.mozilla.org/4",
+      "https://www.mozilla.org/5",
+      "https://www.mozilla.org/6",
+      "https://www.mozilla.org/7",
+      "https://www.mozilla.org/8",
+    ]
+  );
+
+  // A limit of 0 removes the cap, so all remaining fetches should start.
+  Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 0);
+
+  await TestUtils.waitForTick();
+
+  Assert.deepEqual(
+    requests.map(r => r.url),
+    [
+      "https://www.mozilla.org/1",
+      "https://www.mozilla.org/2",
+      "https://www.mozilla.org/3",
+      "https://www.mozilla.org/4",
+      "https://www.mozilla.org/5",
+      "https://www.mozilla.org/6",
+      "https://www.mozilla.org/7",
+      "https://www.mozilla.org/8",
+      "https://www.mozilla.org/9",
+      "https://www.mozilla.org/10",
+      "https://www.mozilla.org/11",
+    ]
+  );
+
+  requests[10].resolve({
+    date: 345334,
+    url: "https://www.mozilla.org/11",
+    data: {},
+  });
+  requests[2].resolve({
+    date: 345334,
+    url: "https://www.mozilla.org/3",
+    data: {},
+  });
+  requests[7].resolve({
+    date: 345334,
+    url: "https://www.mozilla.org/8",
+    data: {},
+  });
+  requests[6].resolve({
+    date: 345334,
+    url: "https://www.mozilla.org/7",
+    data: {},
+  });
+  requests[8].resolve({
+    date: 345334,
+    url: "https://www.mozilla.org/9",
+    data: {},
+  });
+  requests[9].resolve({
+    date: 345334,
+    url: "https://www.mozilla.org/10",
+    data: {},
+  });
+
+  await TestUtils.waitForTick();
+
+  Assert.deepEqual(
+    requests.map(r => r.url),
+    [
+      "https://www.mozilla.org/1",
+      "https://www.mozilla.org/2",
+      "https://www.mozilla.org/3",
+      "https://www.mozilla.org/4",
+      "https://www.mozilla.org/5",
+      "https://www.mozilla.org/6",
+      "https://www.mozilla.org/7",
+      "https://www.mozilla.org/8",
+      "https://www.mozilla.org/9",
+      "https://www.mozilla.org/10",
+      "https://www.mozilla.org/11",
+    ]
+  );
+
+  PageDataService.off("page-data", listener);
+
+  delete PageDataService.fetchPageData;
+
+  Assert.deepEqual(results, [
+    "https://www.mozilla.org/2",
+    "https://www.mozilla.org/6",
+    "https://www.mozilla.org/1",
+    "https://www.mozilla.org/11",
+    "https://www.mozilla.org/3",
+    "https://www.mozilla.org/8",
+    "https://www.mozilla.org/7",
+    "https://www.mozilla.org/9",
+    "https://www.mozilla.org/10",
+  ]);
+});
+
+// Tests that the user idle state stops and starts fetches.
+add_task(async function test_idle() {
+  Services.prefs.setIntPref("browser.pagedata.maxBackgroundFetches", 3);
+  // Pretend we are active.
+  PageDataService.observe(null, "active", null);
+
+  let requests = [];
+  PageDataService.fetchPageData = url => {
+    let { promise, resolve, reject } = Promise.withResolvers();
+    requests.push({ url, resolve, reject });
+
+    return promise;
+  };
+
+  let results = [];
+  let listener = (_, pageData) => {
+    results.push(pageData?.url);
+  };
+
+  PageDataService.on("page-data", listener);
+
+  PageDataService.queueFetch("https://www.mozilla.org/1");
+  PageDataService.queueFetch("https://www.mozilla.org/2");
+  PageDataService.queueFetch("https://www.mozilla.org/3");
+  PageDataService.queueFetch("https://www.mozilla.org/4");
+  PageDataService.queueFetch("https://www.mozilla.org/5");
+  PageDataService.queueFetch("https://www.mozilla.org/6");
+  PageDataService.queueFetch("https://www.mozilla.org/7");
+
+  await TestUtils.waitForTick();
+
+  // Nothing will start when active.
+  Assert.deepEqual(
+    requests.map(r => r.url),
+    []
+  );
+
+  // Pretend we are idle.
+  PageDataService.observe(null, "idle", null);
+
+  await TestUtils.waitForTick();
+
+  Assert.deepEqual(
+    requests.map(r => r.url),
+    [
+      "https://www.mozilla.org/1",
+      "https://www.mozilla.org/2",
+      "https://www.mozilla.org/3",
+    ]
+  );
+
+  // Completing or rejecting a request should start new ones.
+
+  requests[1].resolve({
+    date: 2345,
+    url: "https://www.mozilla.org/2",
+    data: {},
+  });
+
+  await TestUtils.waitForTick();
+
+  Assert.deepEqual(
+    requests.map(r => r.url),
+    [
+      "https://www.mozilla.org/1",
+      "https://www.mozilla.org/2",
+      "https://www.mozilla.org/3",
+      "https://www.mozilla.org/4",
+    ]
+  );
+
+  // But not when active
+  PageDataService.observe(null, "active", null);
+
+  requests[3].resolve({
+    date: 2345,
+    url: "https://www.mozilla.org/4",
+    data: {},
+  });
+  requests[0].resolve({
+    date: 2345,
+    url: "https://www.mozilla.org/1",
+    data: {},
+  });
+  requests[2].resolve({
+    date: 2345,
+    url: "https://www.mozilla.org/3",
+    data: {},
+  });
+
+  await TestUtils.waitForTick();
+
+  Assert.deepEqual(
+    requests.map(r => r.url),
+    [
+      "https://www.mozilla.org/1",
+      "https://www.mozilla.org/2",
+      "https://www.mozilla.org/3",
+      "https://www.mozilla.org/4",
+    ]
+  );
+
+  // Going idle again should start more fetches.
+  PageDataService.observe(null, "idle", null);
+
+  await TestUtils.waitForTick();
+
+  Assert.deepEqual(
+    requests.map(r => r.url),
+    [
+      "https://www.mozilla.org/1",
+      "https://www.mozilla.org/2",
+      "https://www.mozilla.org/3",
+      "https://www.mozilla.org/4",
+      "https://www.mozilla.org/5",
+      "https://www.mozilla.org/6",
+      "https://www.mozilla.org/7",
+    ]
+  );
+
+  requests[4].resolve({
+    date: 2345,
+    url: "https://www.mozilla.org/5",
+    data: {},
+  });
+  requests[5].resolve({
+    date: 2345,
+    url: "https://www.mozilla.org/6",
+    data: {},
+  });
+  requests[6].resolve({
+    date: 2345,
+    url: "https://www.mozilla.org/7",
+    data: {},
+  });
+
+  await TestUtils.waitForTick();
+
+  Assert.deepEqual(
+    requests.map(r => r.url),
+    [
+      "https://www.mozilla.org/1",
+      "https://www.mozilla.org/2",
+      "https://www.mozilla.org/3",
+      "https://www.mozilla.org/4",
+      "https://www.mozilla.org/5",
+      "https://www.mozilla.org/6",
+      "https://www.mozilla.org/7",
+    ]
+  );
+
+  PageDataService.off("page-data", listener);
+
+  delete PageDataService.fetchPageData;
+
+  Assert.deepEqual(results, [
+    "https://www.mozilla.org/2",
+    "https://www.mozilla.org/4",
+    "https://www.mozilla.org/1",
+    "https://www.mozilla.org/3",
+    "https://www.mozilla.org/5",
+    "https://www.mozilla.org/6",
+    "https://www.mozilla.org/7",
+  ]);
+});
diff --git a/browser/components/pagedata/tests/unit/test_schemaorg.js b/browser/components/pagedata/tests/unit/test_schemaorg.js
new file mode 100644
index 0000000000..5470410e4f
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/test_schemaorg.js
@@ -0,0 +1,213 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Tests that the page data service can parse schema.org metadata into PageData.
+ */
+
+add_task(async function test_single_product_microdata() {
+  await verifyPageData(
+    `
+      <!DOCTYPE html>
+      <html>
+      <head>
+      <title>Product Info 1</title>
+      </head>
+      <body>
+        <div itemscope itemtype="https://schema.org/Organization">
+          <div itemprop="employee" itemscope itemtype="https://schema.org/Person">
+            <span itemprop="name">Mr. Nested Name</span>
+          </div>
+
+          <span itemprop="name">Mozilla</span>
+        </div>
+
+        <div itemscope itemtype="https://schema.org/Product">
+          <img itemprop="image" src="bon-echo-microwave-17in.jpg" />
+          <a href="microwave.html" itemprop="url">
+            <span itemprop="name">Bon Echo Microwave</span>
+          </a>
+
+          <div itemprop="offers" itemscope itemtype="https://schema.org/Offer">
+            <span itemprop="price" content="3.50">£3.50</span>
+            <span itemprop="priceCurrency" content="GBP"></span>
+          </div>
+
+          <span itemprop="gtin" content="13572468"></span>
+
+          <span itemprop="description">The most amazing microwave in the world</span>
+        </div>
+      </body>
+      </html>
+    `,
+    {
+      siteName: "Mozilla",
+      description: "The most amazing microwave in the world",
+      image: BASE_URL + "/bon-echo-microwave-17in.jpg",
+      data: {
+        [PageDataSchema.DATA_TYPE.PRODUCT]: {
+          name: "Bon Echo Microwave",
+          price: {
+            value: 3.5,
+            currency: "GBP",
+          },
+        },
+      },
+    }
+  );
+});
+
+add_task(async function test_single_product_json_ld() {
+  await verifyPageData(
+    `
+      <!DOCTYPE html>
+      <html>
+      <head>
+      <script type="application/ld+json">
+        {
+          "@context": "http://schema.org",
+          "@type": "Organization",
+          "employee": {
+            "@type": "Person",
+            "name": "Mr. Nested Name"
+          },
+          "name": "Mozilla"
+        }
+      </script>
+      <script type="application/ld+json">
+        {
+          "@context": "https://schema.org",
+          "@type": "Product",
+          "image": "bon-echo-microwave-17in.jpg",
+          "url": "microwave.html",
+          "name": "Bon Echo Microwave",
+          "offers": {
+            "@type": "Offer",
+            "price": "3.50",
+            "priceCurrency": "GBP"
+          },
+          "gtin": "13572468",
+          "description": "The most amazing microwave in the world"
+        }
+      </script>
+      </head>
+      <body>
+      </body>
+      </html>
+    `,
+    {
+      siteName: "Mozilla",
+      description: "The most amazing microwave in the world",
+      image: BASE_URL + "/bon-echo-microwave-17in.jpg",
+      data: {
+        [PageDataSchema.DATA_TYPE.PRODUCT]: {
+          name: "Bon Echo Microwave",
+          price: {
+            value: 3.5,
+            currency: "GBP",
+          },
+        },
+      },
+    }
+  );
+});
+
+add_task(async function test_single_product_combined() {
+  await verifyPageData(
+    `
+      <!DOCTYPE html>
+      <html>
+      <head>
+      <script type="application/ld+json">
+        {
+          "@context": "https://schema.org",
+          "@type": "Product",
+          "image": "bon-echo-microwave-17in.jpg",
+          "url": "microwave.html",
+          "name": "Bon Echo Microwave",
+          "offers": {
+            "@type": "Offer",
+            "price": "3.50",
+            "priceCurrency": "GBP"
+          },
+          "gtin": "13572468",
+          "description": "The most amazing microwave in the world"
+        }
+      </script>
+      </head>
+      <body>
+        <div itemscope itemtype="https://schema.org/Organization">
+          <div itemprop="employee" itemscope itemtype="https://schema.org/Person">
+            <span itemprop="name">Mr. Nested Name</span>
+          </div>
+
+          <span itemprop="name">Mozilla</span>
+        </div>
+      </body>
+      </html>
+    `,
+    {
+      siteName: "Mozilla",
+      description: "The most amazing microwave in the world",
+      image: BASE_URL + "/bon-echo-microwave-17in.jpg",
+      data: {
+        [PageDataSchema.DATA_TYPE.PRODUCT]: {
+          name: "Bon Echo Microwave",
+          price: {
+            value: 3.5,
+            currency: "GBP",
+          },
+        },
+      },
+    }
+  );
+});
+
+add_task(async function test_single_multiple_microdata() {
+  await verifyPageData(
+    `
+      <!DOCTYPE html>
+      <html>
+      <head>
+      <title>Product Info 2</title>
+      </head>
+      <body>
+        <div itemscope itemtype="https://schema.org/Product">
+          <img itemprop="image" src="bon-echo-microwave-17in.jpg" />
+          <a href="microwave.html" itemprop="url">
+            <span itemprop="name">Bon Echo Microwave</span>
+          </a>
+
+          <div itemprop="offers" itemscope itemtype="https://schema.org/Offer">
+            <span itemprop="price" content="3.28">£3.28</span>
+            <span itemprop="priceCurrency" content="GBP"></span>
+          </div>
+
+          <span itemprop="gtin" content="13572468"></span>
+        </div>
+        <div itemscope itemtype="http://schema.org/Product">
+          <img itemprop="image" src="gran-paradiso-toaster-17in.jpg" />
+          <a href="toaster.html" itemprop="url">
+            <span itemprop="name">Gran Paradiso Toaster</span>
+          </a>
+
+          <span itemprop="gtin" content="15263748"></span>
+        </div>
+      </body>
+      </html>
+    `,
+    {
+      image: BASE_URL + "/bon-echo-microwave-17in.jpg",
+      data: {
+        [PageDataSchema.DATA_TYPE.PRODUCT]: {
+          name: "Bon Echo Microwave",
+          price: {
+            value: 3.28,
+            currency: "GBP",
+          },
+        },
+      },
+    }
+  );
+});
diff --git a/browser/components/pagedata/tests/unit/test_schemaorg_parse.js b/browser/components/pagedata/tests/unit/test_schemaorg_parse.js
new file mode 100644
index 0000000000..e002598af2
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/test_schemaorg_parse.js
@@ -0,0 +1,193 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Tests that the page data service can parse schema.org metadata into Item
+ * structures.
+ */
+
+const { SchemaOrgPageData } = ChromeUtils.importESModule(
+  "resource:///modules/pagedata/SchemaOrgPageData.sys.mjs"
+);
+
+/**
+ * Parses an html string and gathers the schema.org items it contains.
+ *
+ * @param {string} docStr
+ *   The html markup to turn into a document.
+ * @returns {Promise<Item[]>} The items collected from the parsed document.
+ */
+async function collectItems(docStr) {
+  const parsedDocument = await parseDocument(docStr);
+  return SchemaOrgPageData.collectItems(parsedDocument);
+}
+
+/**
+ * Parses the html, converts every collected item to its JSON-LD form and
+ * asserts that the result deep-equals the expected objects.
+ *
+ * @param {string} docStr
+ *   The html markup to parse.
+ * @param {object[]} expected
+ *   The JSON-LD objects the collected items must convert to.
+ */
+async function verifyItems(docStr, expected) {
+  const parsedItems = await collectItems(docStr);
+  const actual = Array.from(parsedItems, entry => entry.toJsonLD());
+  Assert.deepEqual(actual, expected);
+}
+
+// Microdata items, including a nested one, should convert to JSON-LD objects.
+add_task(async function test_microdata_parse() {
+  const html = `
+      <!DOCTYPE html>
+      <html>
+      <head>
+      <title>Product Info 1</title>
+      </head>
+      <body itemprop="badprop">
+        <div itemscope itemtype="https://schema.org/Organization">
+          <div itemprop="employee" itemscope itemtype="https://schema.org/Person">
+            <span itemprop="name">Mr. Nested Name</span>
+          </div>
+
+          <span itemprop="name">Mozilla</span>
+        </div>
+
+        <div itemscope itemtype="https://schema.org/Product">
+          <img itemprop="image" src="bon-echo-microwave-17in.jpg" />
+          <a href="microwave.html" itemprop="url">
+            <span itemprop="name">Bon Echo Microwave</span>
+          </a>
+
+          <div itemprop="offers" itemscope itemtype="https://schema.org/Offer">
+            <span itemprop="price" content="3.50">£3.50</span>
+            <span itemprop="priceCurrency" content="GBP"></span>
+          </div>
+
+          <span itemprop="gtin" content="13572468"></span>
+
+          <span itemprop="description">The most amazing microwave in the world</span>
+        </div>
+      </body>
+      </html>
+    `;
+  const expected = [
+    {
+      "@type": "Organization",
+      employee: {
+        "@type": "Person",
+        name: "Mr. Nested Name",
+      },
+      name: "Mozilla",
+    },
+    {
+      "@type": "Product",
+      image: `${BASE_URL}/bon-echo-microwave-17in.jpg`,
+      url: `${BASE_URL}/microwave.html`,
+      name: "Bon Echo Microwave",
+      offers: {
+        "@type": "Offer",
+        price: "3.50",
+        priceCurrency: "GBP",
+      },
+      gtin: "13572468",
+      description: "The most amazing microwave in the world",
+    },
+  ];
+  await verifyItems(html, expected);
+});
+
+// JSON-LD script blocks: relative urls are expected to come back unresolved.
+add_task(async function test_json_ld_parse() {
+  const html = `
+      <!DOCTYPE html>
+      <html>
+      <head>
+      <script type="application/ld+json">
+        {
+          "@context": "http://schema.org",
+          "@type": "Organization",
+          "employee": {
+            "@type": "Person",
+            "name": "Mr. Nested Name"
+          },
+          "name": "Mozilla"
+        }
+      </script>
+      <script type="application/ld+json">
+        {
+          "@context": "https://schema.org",
+          "@type": "Product",
+          "image": "bon-echo-microwave-17in.jpg",
+          "url": "microwave.html",
+          "name": "Bon Echo Microwave",
+          "offers": {
+            "@type": "Offer",
+            "price": "3.50",
+            "priceCurrency": "GBP"
+          },
+          "gtin": "13572468",
+          "description": "The most amazing microwave in the world"
+        }
+      </script>
+      </head>
+      <body>
+      </body>
+      </html>
+    `;
+  const expected = [
+    {
+      "@type": "Organization",
+      employee: {
+        "@type": "Person",
+        name: "Mr. Nested Name",
+      },
+      name: "Mozilla",
+    },
+    {
+      "@type": "Product",
+      image: "bon-echo-microwave-17in.jpg",
+      url: "microwave.html",
+      name: "Bon Echo Microwave",
+      offers: {
+        "@type": "Offer",
+        price: "3.50",
+        priceCurrency: "GBP",
+      },
+      gtin: "13572468",
+      description: "The most amazing microwave in the world",
+    },
+  ];
+  await verifyItems(html, expected);
+});
+
+// An img carrying data-src is expected to report that url instead of src.
+add_task(async function test_microdata_lazy_image() {
+  const html = `
+      <!DOCTYPE html>
+      <html>
+      <head>
+      <title>Product Info 1</title>
+      </head>
+      <body itemprop="badprop">
+        <div itemscope itemtype="https://schema.org/Product">
+          <img itemprop="image" src="lazy-load.gif" data-src="bon-echo-microwave-17in.jpg" />
+          <a href="microwave.html" itemprop="url">
+            <span itemprop="name">Bon Echo Microwave</span>
+          </a>
+        </div>
+      </body>
+      </html>
+    `;
+  const expected = [
+    {
+      "@type": "Product",
+      image: `${BASE_URL}/bon-echo-microwave-17in.jpg`,
+      url: `${BASE_URL}/microwave.html`,
+      name: "Bon Echo Microwave",
+    },
+  ];
+  await verifyItems(html, expected);
+});
diff --git a/browser/components/pagedata/tests/unit/test_twitter.js b/browser/components/pagedata/tests/unit/test_twitter.js
new file mode 100644
index 0000000000..a49491f5c6
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/test_twitter.js
@@ -0,0 +1,34 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * Basic tests for twitter cards.
+ */
+
+// Twitter card meta tags should yield siteName, description and image.
+add_task(async function test_twitter_card() {
+  const html = `
+      <!DOCTYPE html>
+      <html>
+      <head>
+        <meta name="twitter:card" content="summary_large_image">
+        <meta name="twitter:site" content="@nytimes">
+        <meta name="twitter:creator" content="@SarahMaslinNir">
+        <meta name="twitter:title" content="Parade of Fans for Houston’s Funeral">
+        <meta name="twitter:description" content="NEWARK - The guest list and parade of limousines">
+        <meta name="twitter:image" content="http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg">
+      </head>
+      <body>
+      </body>
+      </html>
+    `;
+  const expected = {
+    siteName: "@nytimes",
+    description: "NEWARK - The guest list and parade of limousines",
+    image:
+      "http://graphics8.nytimes.com/images/2012/02/19/us/19whitney-span/19whitney-span-articleLarge.jpg",
+    data: {},
+  };
+  await verifyPageData(html, expected);
+});
diff --git a/browser/components/pagedata/tests/unit/xpcshell.toml b/browser/components/pagedata/tests/unit/xpcshell.toml
new file mode 100644
index 0000000000..a04ab47455
--- /dev/null
+++ b/browser/components/pagedata/tests/unit/xpcshell.toml
@@ -0,0 +1,19 @@
+[DEFAULT]
+firefox-appdir = "browser"
+skip-if = ["os == 'android'"] # Skipped on Android; see bug 1730213.
+support-files = ["head.js"]
+head = "head.js"
+
+["test_opengraph.js"]
+
+["test_pagedata_basic.js"]
+
+["test_pagedata_schema.js"]
+
+["test_queue.js"]
+
+["test_schemaorg.js"]
+
+["test_schemaorg_parse.js"]
+
+["test_twitter.js"]