summaryrefslogtreecommitdiffstats
path: root/browser/components/search/SearchSERPTelemetry.sys.mjs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 00:47:55 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 00:47:55 +0000
commit26a029d407be480d791972afb5975cf62c9360a6 (patch)
treef435a8308119effd964b339f76abb83a57c29483 /browser/components/search/SearchSERPTelemetry.sys.mjs
parentInitial commit. (diff)
downloadfirefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz
firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
Adding upstream version 124.0.1.upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'browser/components/search/SearchSERPTelemetry.sys.mjs')
-rw-r--r--browser/components/search/SearchSERPTelemetry.sys.mjs2515
1 files changed, 2515 insertions, 0 deletions
diff --git a/browser/components/search/SearchSERPTelemetry.sys.mjs b/browser/components/search/SearchSERPTelemetry.sys.mjs
new file mode 100644
index 0000000000..00105241bb
--- /dev/null
+++ b/browser/components/search/SearchSERPTelemetry.sys.mjs
@@ -0,0 +1,2515 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
+
+const lazy = {};
+
+ChromeUtils.defineESModuleGetters(lazy, {
+ BrowserSearchTelemetry: "resource:///modules/BrowserSearchTelemetry.sys.mjs",
+ PrivateBrowsingUtils: "resource://gre/modules/PrivateBrowsingUtils.sys.mjs",
+ Region: "resource://gre/modules/Region.sys.mjs",
+ RemoteSettings: "resource://services-settings/remote-settings.sys.mjs",
+ SearchUtils: "resource://gre/modules/SearchUtils.sys.mjs",
+});
+
+ChromeUtils.defineLazyGetter(lazy, "gCryptoHash", () => {
+ return Cc["@mozilla.org/security/hash;1"].createInstance(Ci.nsICryptoHash);
+});
+
+// The various histograms and scalars that we report to.
+// NOTE(review): the *_BASE names end in "." so a suffix is presumably appended
+// to build the full scalar name; the code doing so is outside this excerpt.
+const SEARCH_CONTENT_SCALAR_BASE = "browser.search.content.";
+const SEARCH_WITH_ADS_SCALAR_BASE = "browser.search.withads.";
+const SEARCH_AD_CLICKS_SCALAR_BASE = "browser.search.adclicks.";
+const SEARCH_DATA_TRANSFERRED_SCALAR = "browser.search.data_transferred";
+const SEARCH_TELEMETRY_PRIVATE_BROWSING_KEY_SUFFIX = "pb";
+
+// Exported for tests.
+export const ADLINK_CHECK_TIMEOUT_MS = 1000;
+// Unlike the standard adlink check, the timeout for single page apps is not
+// based on a content event within the page, like DOMContentLoaded or load.
+// Thus, we aim for a longer timeout to account for when the server might be
+// slow to update the content on the page.
+export const SPA_ADLINK_CHECK_TIMEOUT_MS = 2500;
+// Key passed to RemoteSettings() by TelemetryHandler.init() to fetch the raw
+// search provider info.
+export const TELEMETRY_SETTINGS_KEY = "search-telemetry-v2";
+// NOTE(review): presumably the Remote Settings key for the categorization
+// data; its use is not visible in this part of the file.
+export const TELEMETRY_CATEGORIZATION_KEY = "search-categorization";
+export const TELEMETRY_CATEGORIZATION_DOWNLOAD_SETTINGS = {
+ // Units are in milliseconds.
+ // 3600000 ms = 1 hour, 60000 ms = 1 minute, 600000 ms = 10 minutes.
+ base: 3600000,
+ minAdjust: 60000,
+ maxAdjust: 600000,
+ maxTriesPerSession: 2,
+};
+
+// Names shared with other processes. PROVIDER_INFO is the key under which
+// TelemetryHandler writes the provider info into Services.ppmm.sharedData
+// after a Remote Settings sync.
+// NOTE(review): the two *_TIMEOUT entries are not used in this part of the
+// file; presumably they are sharedData keys for the adlink check timeouts.
+export const SEARCH_TELEMETRY_SHARED = {
+ PROVIDER_INFO: "SearchTelemetry:ProviderInfo",
+ LOAD_TIMEOUT: "SearchTelemetry:LoadTimeout",
+ SPA_LOAD_TIMEOUT: "SearchTelemetry:SPALoadTimeout",
+};
+
+// Impression ids that have not yet had an engagement or abandonment recorded.
+// An id is added when TelemetryHandler.updateTrackingStatus() generates it and
+// removed when an engagement or abandonment event is recorded for it.
+const impressionIdsWithoutEngagementsSet = new Set();
+
+// Tuning values for SERP categorization.
+// NOTE(review): none of these are read in this part of the file; the units
+// are taken from the property names (seconds / milliseconds) — both timeouts
+// evaluate to one hour.
+export const CATEGORIZATION_SETTINGS = {
+ MAX_DOMAINS_TO_CATEGORIZE: 10,
+ MINIMUM_SCORE: 0,
+ STARTING_RANK: 2,
+ IDLE_TIMEOUT_SECONDS: 60 * 60,
+ WAKE_TIMEOUT_MS: 60 * 60 * 1000,
+};
+
+ChromeUtils.defineLazyGetter(lazy, "logConsole", () => {
+ return console.createInstance({
+ prefix: "SearchTelemetry",
+ maxLogLevel: lazy.SearchUtils.loggingEnabled ? "Debug" : "Warn",
+ });
+});
+
+XPCOMUtils.defineLazyPreferenceGetter(
+ lazy,
+ "serpEventsEnabled",
+ "browser.search.serpEventTelemetry.enabled",
+ true
+);
+
+const CATEGORIZATION_PREF =
+ "browser.search.serpEventTelemetryCategorization.enabled";
+
+XPCOMUtils.defineLazyPreferenceGetter(
+ lazy,
+ "serpEventTelemetryCategorization",
+ CATEGORIZATION_PREF,
+ false,
+ (aPreference, previousValue, newValue) => {
+ if (newValue) {
+ SearchSERPDomainToCategoriesMap.init();
+ SearchSERPCategorizationEventScheduler.init();
+ } else {
+ SearchSERPDomainToCategoriesMap.uninit();
+ SearchSERPCategorizationEventScheduler.uninit();
+ }
+ }
+);
+
+// String (and numeric) constants used when recording SERP telemetry.
+export const SearchSERPTelemetryUtils = {
+ // Values for the `action` field of a Glean serp engagement event.
+ ACTIONS: {
+ CLICKED: "clicked",
+ EXPANDED: "expanded",
+ SUBMITTED: "submitted",
+ },
+ // Names of page components; used as the `target` of an engagement event.
+ // The AD_* entries are also collected in AD_COMPONENTS below.
+ COMPONENTS: {
+ AD_CAROUSEL: "ad_carousel",
+ AD_IMAGE_ROW: "ad_image_row",
+ AD_LINK: "ad_link",
+ AD_SIDEBAR: "ad_sidebar",
+ AD_SITELINK: "ad_sitelink",
+ INCONTENT_SEARCHBOX: "incontent_searchbox",
+ NON_ADS_LINK: "non_ads_link",
+ REFINED_SEARCH_BUTTONS: "refined_search_buttons",
+ SHOPPING_TAB: "shopping_tab",
+ },
+ // Reasons recorded with a Glean serp abandonment event.
+ ABANDONMENTS: {
+ NAVIGATION: "navigation",
+ TAB_CLOSE: "tab_close",
+ WINDOW_CLOSE: "window_close",
+ },
+ // Sources of a SERP load that originate from page content rather than from
+ // the browser UI; see TelemetryHandler.setBrowserContentSource().
+ INCONTENT_SOURCES: {
+ OPENED_IN_NEW_TAB: "opened_in_new_tab",
+ REFINE_ON_SERP: "follow_on_from_refine_on_SERP",
+ SEARCHBOX: "follow_on_from_refine_on_incontent_search",
+ },
+ // NOTE(review): INCONCLUSIVE is not used in this part of the file;
+ // presumably the category value reported when categorization fails.
+ CATEGORIZATION: {
+ INCONCLUSIVE: 0,
+ },
+};
+
+const AD_COMPONENTS = [
+ SearchSERPTelemetryUtils.COMPONENTS.AD_CAROUSEL,
+ SearchSERPTelemetryUtils.COMPONENTS.AD_IMAGE_ROW,
+ SearchSERPTelemetryUtils.COMPONENTS.AD_LINK,
+ SearchSERPTelemetryUtils.COMPONENTS.AD_SIDEBAR,
+ SearchSERPTelemetryUtils.COMPONENTS.AD_SITELINK,
+];
+
+/**
+ * TelemetryHandler is the main class handling Search Engine Result Page (SERP)
+ * telemetry. It primarily deals with tracking of what pages are loaded into tabs.
+ *
+ * It handles the *in-content:sap* keys of the SEARCH_COUNTS histogram.
+ */
+class TelemetryHandler {
+ // Whether or not this class is initialised.
+ _initialized = false;
+
+ // An instance of ContentHandler.
+ _contentHandler;
+
+ // The original provider information, mainly used for tests.
+ _originalProviderInfo = null;
+
+ // The current search provider info.
+ _searchProviderInfo = null;
+
+ // An instance of remote settings that is used to access the provider info.
+ _telemetrySettings;
+
+ // Callback used when syncing telemetry settings.
+ #telemetrySettingsSync;
+
+ // _browserInfoByURL is a map of tracked search urls to objects containing:
+ // * {object} info
+ // the search provider information associated with the url.
+ // * {WeakMap} browserTelemetryStateMap
+ // a weak map of browsers that have the url loaded, their ad report state,
+ // and their impression id.
+ // * {integer} count
+ // a manual count of browsers logged.
+ // We keep a weak map of browsers, in case we miss something on our counts
+ // and cause a memory leak - worst case our map is slightly bigger than it
+ // needs to be.
+ // The manual count is because WeakMap doesn't give us size/length
+ // information, but we want to know when we can clean up our associated
+ // entry.
+ _browserInfoByURL = new Map();
+
+ // Browser objects mapped to the info in _browserInfoByURL.
+ #browserToItemMap = new WeakMap();
+
+ // _browserSourceMap is a map of the latest search source for a particular
+ // browser - one of the KNOWN_SEARCH_SOURCES in BrowserSearchTelemetry.
+ _browserSourceMap = new WeakMap();
+
+ /**
+ * A WeakMap whose key is a browser with value of a source type found in
+ * INCONTENT_SOURCES. Kept separate to avoid overlapping with legacy
+ * search sources. These sources are specific to the content of a search
+ * provider page rather than something from within the browser itself.
+ */
+ #browserContentSourceMap = new WeakMap();
+
+ /**
+ * Sets the source of a SERP visit from something that occurred in content
+ * rather than from the browser.
+ *
+ * The stored value is read and removed by updateTrackingStatus() the next
+ * time this browser loads a SERP with components while SERP event telemetry
+ * is enabled; setting it again before then overwrites the previous value.
+ *
+ * @param {browser} browser
+ * The browser object associated with the page that should be a SERP.
+ * @param {string} source
+ * The source that started the load. One of
+ * SearchSERPTelemetryUtils.COMPONENTS.INCONTENT_SEARCHBOX,
+ * SearchSERPTelemetryUtils.INCONTENT_SOURCES.OPENED_IN_NEW_TAB or
+ * SearchSERPTelemetryUtils.INCONTENT_SOURCES.REFINE_ON_SERP.
+ * NOTE(review): INCONTENT_SOURCES also defines SEARCHBOX, so the
+ * COMPONENTS.INCONTENT_SEARCHBOX entry above may be stale — confirm
+ * against the callers.
+ */
+ setBrowserContentSource(browser, source) {
+ this.#browserContentSourceMap.set(browser, source);
+ }
+
+ // _browserNewtabSessionMap is a map of the newtab session id for particular
+ // browsers.
+ _browserNewtabSessionMap = new WeakMap();
+
+ constructor() {
+ this._contentHandler = new ContentHandler({
+ browserInfoByURL: this._browserInfoByURL,
+ findBrowserItemForURL: (...args) => this._findBrowserItemForURL(...args),
+ checkURLForSerpMatch: (...args) => this._checkURLForSerpMatch(...args),
+ findItemForBrowser: (...args) => this.findItemForBrowser(...args),
+ });
+ }
+
+ /**
+ * Initializes the TelemetryHandler and its ContentHandler. It will add
+ * appropriate listeners to the window so that window opening and closing
+ * can be tracked.
+ */
+ async init() {
+ if (this._initialized) {
+ return;
+ }
+
+ this._telemetrySettings = lazy.RemoteSettings(TELEMETRY_SETTINGS_KEY);
+ let rawProviderInfo = [];
+ try {
+ rawProviderInfo = await this._telemetrySettings.get();
+ } catch (ex) {
+ lazy.logConsole.error("Could not get settings:", ex);
+ }
+
+ this.#telemetrySettingsSync = event => this.#onSettingsSync(event);
+ this._telemetrySettings.on("sync", this.#telemetrySettingsSync);
+
+ // Send the provider info to the child handler.
+ this._contentHandler.init(rawProviderInfo);
+ this._originalProviderInfo = rawProviderInfo;
+
+ // Now convert the regexps into
+ this._setSearchProviderInfo(rawProviderInfo);
+
+ for (let win of Services.wm.getEnumerator("navigator:browser")) {
+ this._registerWindow(win);
+ }
+ Services.wm.addListener(this);
+
+ this._initialized = true;
+ }
+
+ async #onSettingsSync(event) {
+ let current = event.data?.current;
+ if (current) {
+ lazy.logConsole.debug(
+ "Update provider info due to Remote Settings sync."
+ );
+ this._originalProviderInfo = current;
+ this._setSearchProviderInfo(current);
+ Services.ppmm.sharedData.set(
+ SEARCH_TELEMETRY_SHARED.PROVIDER_INFO,
+ current
+ );
+ Services.ppmm.sharedData.flush();
+ } else {
+ lazy.logConsole.debug(
+ "Ignoring Remote Settings sync data due to missing records."
+ );
+ }
+ Services.obs.notifyObservers(null, "search-telemetry-v2-synced");
+ }
+
+ /**
+ * Uninitializes the TelemetryHandler and its ContentHandler.
+ */
+ uninit() {
+ if (!this._initialized) {
+ return;
+ }
+
+ this._contentHandler.uninit();
+
+ for (let win of Services.wm.getEnumerator("navigator:browser")) {
+ this._unregisterWindow(win);
+ }
+ Services.wm.removeListener(this);
+
+ try {
+ this._telemetrySettings.off("sync", this.#telemetrySettingsSync);
+ } catch (ex) {
+ lazy.logConsole.error(
+ "Failed to shutdown SearchSERPTelemetry Remote Settings.",
+ ex
+ );
+ }
+ this._telemetrySettings = null;
+ this.#telemetrySettingsSync = null;
+
+ this._initialized = false;
+ }
+
+ /**
+ * Records the search source for particular browsers, in case it needs
+ * to be associated with a SERP.
+ *
+ * The value is stored in a WeakMap keyed by browser. updateTrackingStatus()
+ * reads and removes it on the next SERP load in that browser that is neither
+ * a reload nor a session-history navigation.
+ *
+ * @param {browser} browser
+ * The browser where the search originated.
+ * @param {string} source
+ * Where the search originated from.
+ */
+ recordBrowserSource(browser, source) {
+ this._browserSourceMap.set(browser, source);
+ }
+
+ /**
+ * Records the newtab source for particular browsers, in case it needs
+ * to be associated with a SERP.
+ *
+ * The value is stored in a WeakMap keyed by browser. It is removed when the
+ * browser loads a page that is not a SERP, or when its tab is closed.
+ *
+ * @param {browser} browser
+ * The browser where the search originated.
+ * @param {string} newtabSessionId
+ * The sessionId of the newtab session the search originated from.
+ */
+ recordBrowserNewtabSession(browser, newtabSessionId) {
+ this._browserNewtabSessionMap.set(browser, newtabSessionId);
+ }
+
+ /**
+ * Helper function for recording the reason for a Glean abandonment event.
+ *
+ * @param {string} impressionId
+ * The impression id for the abandonment event about to be recorded.
+ * @param {string} reason
+ * The reason the SERP is deemed abandoned.
+ * One of SearchSERPTelemetryUtils.ABANDONMENTS.
+ */
+ recordAbandonmentTelemetry(impressionId, reason) {
+ impressionIdsWithoutEngagementsSet.delete(impressionId);
+
+ lazy.logConsole.debug(
+ `Recording an abandonment event for impression id ${impressionId} with reason: ${reason}`
+ );
+
+ Glean.serp.abandonment.record({
+ impression_id: impressionId,
+ reason,
+ });
+ }
+
+ /**
+ * Handles the TabClose event received from the listeners.
+ *
+ * @param {object} event
+ * The event object provided by the listener.
+ */
+ handleEvent(event) {
+ if (event.type != "TabClose") {
+ console.error("Received unexpected event type", event.type);
+ return;
+ }
+
+ this._browserNewtabSessionMap.delete(event.target.linkedBrowser);
+ this.stopTrackingBrowser(
+ event.target.linkedBrowser,
+ SearchSERPTelemetryUtils.ABANDONMENTS.TAB_CLOSE
+ );
+ }
+
+ /**
+ * Test-only function, used to override the provider information, so that
+ * unit tests can set it to easy to test values.
+ *
+ * @param {Array} providerInfo
+ * See {@link https://searchfox.org/mozilla-central/search?q=search-telemetry-schema.json}
+ * for type information.
+ */
+ overrideSearchTelemetryForTests(providerInfo) {
+ let info = providerInfo ? providerInfo : this._originalProviderInfo;
+ this._contentHandler.overrideSearchTelemetryForTests(info);
+ this._setSearchProviderInfo(info);
+ }
+
+ /**
+ * Used to set the local version of the search provider information.
+ * This automatically maps the regexps to RegExp objects so that
+ * we don't have to create a new instance each time.
+ *
+ * @param {Array} providerInfo
+ * A raw array of provider information to set.
+ */
+ _setSearchProviderInfo(providerInfo) {
+ this._searchProviderInfo = providerInfo.map(provider => {
+ let newProvider = {
+ ...provider,
+ searchPageRegexp: new RegExp(provider.searchPageRegexp),
+ };
+ if (provider.extraAdServersRegexps) {
+ newProvider.extraAdServersRegexps = provider.extraAdServersRegexps.map(
+ r => new RegExp(r)
+ );
+ }
+
+ newProvider.nonAdsLinkRegexps = provider.nonAdsLinkRegexps?.length
+ ? provider.nonAdsLinkRegexps.map(r => new RegExp(r))
+ : [];
+ if (provider.shoppingTab?.regexp) {
+ newProvider.shoppingTab = {
+ selector: provider.shoppingTab.selector,
+ regexp: new RegExp(provider.shoppingTab.regexp),
+ };
+ }
+ return newProvider;
+ });
+ this._contentHandler._searchProviderInfo = this._searchProviderInfo;
+ }
+
+ /**
+ * Forwards to ContentHandler._reportPageAction() with the same arguments.
+ *
+ * @param {object} info
+ * Passed through unchanged; its shape is defined by the content handler.
+ * @param {object} browser
+ * Passed through unchanged.
+ */
+ reportPageAction(info, browser) {
+ this._contentHandler._reportPageAction(info, browser);
+ }
+
+ /**
+ * Forwards to ContentHandler._reportPageWithAds() with the same arguments.
+ *
+ * @param {object} info
+ * Passed through unchanged; its shape is defined by the content handler.
+ * @param {object} browser
+ * Passed through unchanged.
+ */
+ reportPageWithAds(info, browser) {
+ this._contentHandler._reportPageWithAds(info, browser);
+ }
+
+ /**
+ * Forwards to ContentHandler._reportPageWithAdImpressions() with the same
+ * arguments.
+ *
+ * @param {object} info
+ * Passed through unchanged; its shape is defined by the content handler.
+ * @param {object} browser
+ * Passed through unchanged.
+ */
+ reportPageWithAdImpressions(info, browser) {
+ this._contentHandler._reportPageWithAdImpressions(info, browser);
+ }
+
+ /**
+ * Forwards to ContentHandler._reportPageDomains() with the same arguments.
+ *
+ * @param {object} info
+ * Passed through unchanged; its shape is defined by the content handler.
+ * @param {object} browser
+ * Passed through unchanged.
+ */
+ reportPageDomains(info, browser) {
+ this._contentHandler._reportPageDomains(info, browser);
+ }
+
+ /**
+ * Forwards to ContentHandler._reportPageImpression() with the same
+ * arguments.
+ *
+ * @param {object} info
+ * Passed through unchanged; its shape is defined by the content handler.
+ * @param {object} browser
+ * Passed through unchanged.
+ */
+ reportPageImpression(info, browser) {
+ this._contentHandler._reportPageImpression(info, browser);
+ }
+
+ /**
+ * This may start tracking a tab based on the URL. If the URL matches a search
+ * partner, and it has a code, then we'll start tracking it. This will aid
+ * determining if it is a page we should be tracking for adverts.
+ *
+ * @param {object} browser
+ * The browser associated with the page.
+ * @param {string} url
+ * The url that was loaded in the browser.
+ * @param {nsIDocShell.LoadCommand} loadType
+ * The load type associated with the page load.
+ */
+ updateTrackingStatus(browser, url, loadType) {
+ if (
+ !lazy.BrowserSearchTelemetry.shouldRecordSearchCount(
+ browser.getTabBrowser()
+ )
+ ) {
+ return;
+ }
+ let info = this._checkURLForSerpMatch(url);
+ if (!info) {
+ this._browserNewtabSessionMap.delete(browser);
+ this.stopTrackingBrowser(browser);
+ return;
+ }
+
+ let source = "unknown";
+ if (loadType & Ci.nsIDocShell.LOAD_CMD_RELOAD) {
+ source = "reload";
+ } else if (loadType & Ci.nsIDocShell.LOAD_CMD_HISTORY) {
+ source = "tabhistory";
+ } else if (this._browserSourceMap.has(browser)) {
+ source = this._browserSourceMap.get(browser);
+ this._browserSourceMap.delete(browser);
+ }
+
+ // If it's a SERP but doesn't have a browser source, the source might be
+ // from something that happened in content. We keep this separate from
+ // source because legacy telemetry should not change its reporting.
+ let inContentSource;
+ if (
+ lazy.serpEventsEnabled &&
+ info.hasComponents &&
+ this.#browserContentSourceMap.has(browser)
+ ) {
+ inContentSource = this.#browserContentSourceMap.get(browser);
+ this.#browserContentSourceMap.delete(browser);
+ }
+
+ let newtabSessionId;
+ if (this._browserNewtabSessionMap.has(browser)) {
+ newtabSessionId = this._browserNewtabSessionMap.get(browser);
+ // We leave the newtabSessionId in the map for this browser
+ // until we stop loading SERP pages or the tab is closed.
+ }
+
+ let impressionId;
+ if (lazy.serpEventsEnabled && info.hasComponents) {
+ // The UUID generated by Services.uuid contains leading and trailing braces.
+ // Need to trim them first.
+ impressionId = Services.uuid.generateUUID().toString().slice(1, -1);
+
+ impressionIdsWithoutEngagementsSet.add(impressionId);
+ }
+
+ this._reportSerpPage(info, source, url);
+
+ // For single page apps, we store the page by its original URI so the
+ // network observers can recover the browser in a context when they only
+ // have access to the originURL.
+ let urlKey =
+ info.isSPA && browser.originalURI?.spec ? browser.originalURI.spec : url;
+ let item = this._browserInfoByURL.get(urlKey);
+
+ let impressionInfo;
+ if (lazy.serpEventsEnabled && info.hasComponents) {
+ let partnerCode = "";
+ if (info.code != "none" && info.code != null) {
+ partnerCode = info.code;
+ }
+ impressionInfo = {
+ provider: info.provider,
+ tagged: info.type.startsWith("tagged"),
+ partnerCode,
+ source: inContentSource ?? source,
+ isShoppingPage: info.isShoppingPage,
+ isPrivate: lazy.PrivateBrowsingUtils.isBrowserPrivate(browser),
+ };
+ }
+
+ if (item) {
+ item.browserTelemetryStateMap.set(browser, {
+ adsReported: false,
+ adImpressionsReported: false,
+ impressionId,
+ urlToComponentMap: null,
+ impressionInfo,
+ searchBoxSubmitted: false,
+ categorizationInfo: null,
+ adsClicked: 0,
+ adsVisible: 0,
+ searchQuery: info.searchQuery,
+ });
+ item.count++;
+ item.source = source;
+ item.newtabSessionId = newtabSessionId;
+ } else {
+ item = {
+ browserTelemetryStateMap: new WeakMap().set(browser, {
+ adsReported: false,
+ adImpressionsReported: false,
+ impressionId,
+ urlToComponentMap: null,
+ impressionInfo,
+ searchBoxSubmitted: false,
+ categorizationInfo: null,
+ adsClicked: 0,
+ adsVisible: 0,
+ searchQuery: info.searchQuery,
+ }),
+ info,
+ count: 1,
+ source,
+ newtabSessionId,
+ majorVersion: parseInt(Services.appinfo.version),
+ channel: lazy.SearchUtils.MODIFIED_APP_CHANNEL,
+ region: lazy.Region.home,
+ isSPA: info.isSPA,
+ };
+ // For single page apps, we store the page by its original URI so that
+ // network observers can recover the browser in a context when they only
+ // have the originURL to work with.
+ this._browserInfoByURL.set(urlKey, item);
+ }
+ this.#browserToItemMap.set(browser, item);
+ }
+
+ /**
+ * Determines whether or not a browser should be untracked or tracked for
+ * SERPs who have single page app behaviour.
+ *
+ * The over-arching logic:
+ * 1. Only inspect the browser if the url matches a SERP that is a SPA.
+ * 2. Recording an engagement if we're tracking the browser and we're going
+ * to another page.
+ * 3. Untrack the browser if we're tracking it and switching pages.
+ * 4. Track the browser if we're now on a default search page.
+ *
+ * @param {BrowserElement} browser
+ * The browser element related to the request.
+ * @param {string} url
+ * The url of the request.
+ * @param {number} loadType
+ * The loadtype of a the request.
+ */
+ async updateTrackingSinglePageApp(browser, url, loadType) {
+ // Step 1: Only continue for providers flagged as single page apps.
+ let providerInfo = this._getProviderInfoForURL(url);
+ if (!providerInfo?.isSPA) {
+ return;
+ }
+
+ let item = this.findItemForBrowser(browser);
+ let telemetryState = item?.browserTelemetryStateMap.get(browser);
+
+ // An untracked browser is treated as having an empty previous search term.
+ let previousSearchTerm = telemetryState?.searchQuery ?? "";
+ let searchTerm = this.urlSearchTerms(url, providerInfo);
+ let searchTermChanged = previousSearchTerm !== searchTerm;
+
+ // NOTE(review): _checkURLForSerpMatch is declared with a single `url`
+ // parameter in this file, so the providerInfo argument is ignored and the
+ // provider is looked up again.
+ let isSerp = !!this._checkURLForSerpMatch(url, providerInfo);
+ let browserIsTracked = !!telemetryState;
+ // This is the masked bit value (a number), used below for its truthiness.
+ let isTabHistory = loadType & Ci.nsIDocShell.LOAD_CMD_HISTORY;
+
+ // Step 2: Maybe record engagement.
+ if (browserIsTracked && !isTabHistory && (searchTermChanged || !isSerp)) {
+ // If we've established we've changed to another SERP, the cause could be
+ // from a submission event inside the content process. The event is
+ // sent to the parent and stored as `telemetryState.searchBoxSubmitted`
+ // but if we check now, it may be too early. Instead, we check with the
+ // content process directly to see if it recorded a submit event.
+ let actor = browser.browsingContext.currentWindowGlobal.getActor(
+ "SearchSERPTelemetry"
+ );
+ let didSubmit = await actor.sendQuery("SearchSERPTelemetry:DidSubmit");
+
+ // Neither the parent nor the content process saw a searchbox submission,
+ // so the change of page is counted as a click on a non-ads link.
+ if (telemetryState && !telemetryState.searchBoxSubmitted && !didSubmit) {
+ impressionIdsWithoutEngagementsSet.delete(telemetryState.impressionId);
+ Glean.serp.engagement.record({
+ impression_id: telemetryState.impressionId,
+ action: SearchSERPTelemetryUtils.ACTIONS.CLICKED,
+ target: SearchSERPTelemetryUtils.COMPONENTS.NON_ADS_LINK,
+ });
+ lazy.logConsole.debug("Counting click:", {
+ impressionId: telemetryState.impressionId,
+ type: SearchSERPTelemetryUtils.COMPONENTS.NON_ADS_LINK,
+ URL: url,
+ });
+ }
+ }
+
+ // Step 3: Maybe untrack the browser.
+ if (browserIsTracked && (searchTermChanged || !isSerp)) {
+ // The reason stays an empty string unless this was a history navigation.
+ let reason = "";
+ // If we have to untrack it, it might be due to the user using the
+ // back/forward button.
+ if (isTabHistory) {
+ reason = SearchSERPTelemetryUtils.ABANDONMENTS.NAVIGATION;
+ }
+ // The actor is looked up again here rather than reused from step 2: step 2
+ // may not have run, and it awaits before this point is reached.
+ let actor = browser.browsingContext.currentWindowGlobal.getActor(
+ "SearchSERPTelemetry"
+ );
+ actor.sendAsyncMessage("SearchSERPTelemetry:StopTrackingDocument");
+ this.stopTrackingBrowser(browser, reason);
+ browserIsTracked = false;
+ }
+
+ // Step 4: Maybe track the browser.
+ if (isSerp && !browserIsTracked) {
+ this.updateTrackingStatus(browser, url, loadType);
+ let actor = browser.browsingContext.currentWindowGlobal.getActor(
+ "SearchSERPTelemetry"
+ );
+ actor.sendAsyncMessage("SearchSERPTelemetry:WaitForSPAPageLoad");
+ }
+ }
+
+ /**
+ * Stops tracking of a tab, for example the tab has loaded a different URL.
+ * Also records a Glean abandonment event if appropriate.
+ *
+ * @param {object} browser The browser associated with the tab to stop being
+ * tracked.
+ * @param {string} abandonmentReason
+ * An optional parameter that specifies why the browser is deemed abandoned.
+ * The reason will be recorded as part of Glean abandonment telemetry.
+ * One of SearchSERPTelemetryUtils.ABANDONMENTS.
+ */
+ stopTrackingBrowser(browser, abandonmentReason) {
+ for (let [url, item] of this._browserInfoByURL) {
+ if (item.browserTelemetryStateMap.has(browser)) {
+ let telemetryState = item.browserTelemetryStateMap.get(browser);
+ let impressionId = telemetryState.impressionId;
+ if (impressionIdsWithoutEngagementsSet.has(impressionId)) {
+ this.recordAbandonmentTelemetry(impressionId, abandonmentReason);
+ }
+
+ if (
+ lazy.serpEventTelemetryCategorization &&
+ telemetryState.categorizationInfo
+ ) {
+ SearchSERPCategorizationEventScheduler.sendCallback(browser);
+ }
+
+ item.browserTelemetryStateMap.delete(browser);
+ item.count--;
+ }
+
+ if (!item.count) {
+ this._browserInfoByURL.delete(url);
+ }
+ }
+ this.#browserToItemMap.delete(browser);
+ }
+
+ /**
+ * Calculate how close two urls are in equality.
+ *
+ * The scoring system:
+ * - If the URLs look exactly the same, including the ordering of query
+ * parameters, the score is Infinity.
+ * - If the origin is the same, the score is increased by 1. Otherwise the
+ * score is 0.
+ * - If the path is the same, the score is increased by 1.
+ * - For each query parameter, if the key exists the score is increased by 1.
+ * Likewise if the query parameter values match.
+ * - If the hash is the same, the score is increased by 1. This includes if
+ * the hash is missing in both URLs.
+ *
+ * @param {URL} url1
+ * Url to compare.
+ * @param {URL} url2
+ * Other url to compare. Ordering shouldn't matter.
+ * @param {object} [matchOptions]
+ * Options for checking equality.
+ * @param {boolean} [matchOptions.path]
+ * Whether the path must match. Default to false.
+ * @param {boolean} [matchOptions.paramValues]
+ * Whether the values of the query parameters must match if the query
+ * parameter key exists in the other. Defaults to false.
+ * @returns {number}
+ * A score of how closely the two URLs match. Returns 0 if there is no
+ * match or the equality check failed for an enabled match option.
+ */
+ compareUrls(url1, url2, matchOptions = {}) {
+ // In case of an exact match, well, that's an obvious winner.
+ if (url1.href == url2.href) {
+ return Infinity;
+ }
+
+ // Each step we get closer to the two URLs being the same, we increase the
+ // score. The consumer of this method will use these scores to see which
+ // of the URLs is the best match.
+ let score = 0;
+ if (url1.origin == url2.origin) {
+ ++score;
+ if (url1.pathname == url2.pathname) {
+ ++score;
+ for (let [key1, value1] of url1.searchParams) {
+ // Let's not fuss about the ordering of search params, since the
+ // score effect will solve that.
+ if (url2.searchParams.has(key1)) {
+ ++score;
+ if (url2.searchParams.get(key1) == value1) {
+ ++score;
+ } else if (matchOptions.paramValues) {
+ return 0;
+ }
+ }
+ }
+ if (url1.hash == url2.hash) {
+ ++score;
+ }
+ } else if (matchOptions.path) {
+ return 0;
+ }
+ }
+ return score;
+ }
+
+ /**
+ * Extracts the search terms from the URL based on the provider info.
+ *
+ * @param {string} url
+ * The URL to inspect.
+ * @param {object} providerInfo
+ * The providerInfo associated with the URL.
+ * @returns {string}
+ * The search term or if none is found, a blank string.
+ */
+ urlSearchTerms(url, providerInfo) {
+ if (providerInfo?.queryParamNames?.length) {
+ let { searchParams } = new URL(url);
+ for (let queryParamName of providerInfo.queryParamNames) {
+ let value = searchParams.get(queryParamName);
+ if (value) {
+ return value;
+ }
+ }
+ }
+ return "";
+ }
+
+ /**
+ * Looks up the tracking item most recently associated with a browser by
+ * updateTrackingStatus().
+ *
+ * @param {object} browser
+ * The browser to look up.
+ * @returns {object|undefined}
+ * The item stored in _browserInfoByURL for this browser, or undefined if
+ * the browser is not being tracked.
+ */
+ findItemForBrowser(browser) {
+ return this.#browserToItemMap.get(browser);
+ }
+
+ /**
+ * Parts of the URL, like search params and hashes, may be mutated by scripts
+ * on a page we're tracking. Since we don't want to keep track of that
+ * ourselves in order to keep the list of browser objects a weak-referenced
+ * set, we do optional fuzzy matching of URLs to fetch the most relevant item
+ * that contains tracking information.
+ *
+ * @param {string} url URL to fetch the tracking data for.
+ * @returns {object} Map containing the following members:
+ * - {WeakMap} browsers
+ * Map of browser elements that belong to `url` and their ad report state.
+ * - {object} info
+ * Info dictionary as returned by `_checkURLForSerpMatch`.
+ * - {number} count
+ * The number of browser element we can most accurately tell we're
+ * tracking, since they're inside a WeakMap.
+ */
+ _findBrowserItemForURL(url) {
+ try {
+ url = new URL(url);
+ } catch (ex) {
+ return null;
+ }
+
+ let item;
+ let currentBestMatch = 0;
+ for (let [trackingURL, candidateItem] of this._browserInfoByURL) {
+ if (currentBestMatch === Infinity) {
+ break;
+ }
+ try {
+ // Make sure to cache the parsed URL object, since there's no reason to
+ // do it twice.
+ trackingURL =
+ candidateItem._trackingURL ||
+ (candidateItem._trackingURL = new URL(trackingURL));
+ } catch (ex) {
+ continue;
+ }
+ let score = this.compareUrls(url, trackingURL);
+ if (score > currentBestMatch) {
+ item = candidateItem;
+ currentBestMatch = score;
+ }
+ }
+
+ return item;
+ }
+
+ // nsIWindowMediatorListener
+
+ /**
+ * This is called when a new window is opened, and handles registration of
+ * that window if it is a browser window.
+ *
+ * @param {nsIAppWindow} appWin The xul window that was opened.
+ */
+ onOpenWindow(appWin) {
+ let win = appWin.docShell.domWindow;
+ win.addEventListener(
+ "load",
+ () => {
+ if (
+ win.document.documentElement.getAttribute("windowtype") !=
+ "navigator:browser"
+ ) {
+ return;
+ }
+
+ this._registerWindow(win);
+ },
+ { once: true }
+ );
+ }
+
+ /**
+ * Listener that is called when a window is closed, and handles deregistration of
+ * that window if it is a browser window.
+ *
+ * @param {nsIAppWindow} appWin The xul window that was closed.
+ */
+ onCloseWindow(appWin) {
+ let win = appWin.docShell.domWindow;
+
+ if (
+ win.document.documentElement.getAttribute("windowtype") !=
+ "navigator:browser"
+ ) {
+ return;
+ }
+
+ this._unregisterWindow(win);
+ }
+
+ /**
+ * Starts listening for TabClose events on the window's tab container, with
+ * this object as the listener (see handleEvent).
+ *
+ * @param {object} win The window to register.
+ */
+ _registerWindow(win) {
+ win.gBrowser.tabContainer.addEventListener("TabClose", this);
+ }
+
+ /**
+ * Removes event listeners for the window and unregisters it with the content
+ * handler.
+ *
+ * @param {object} win The window to unregister.
+ */
+ _unregisterWindow(win) {
+ for (let tab of win.gBrowser.tabs) {
+ this.stopTrackingBrowser(
+ tab.linkedBrowser,
+ SearchSERPTelemetryUtils.ABANDONMENTS.WINDOW_CLOSE
+ );
+ }
+
+ win.gBrowser.tabContainer.removeEventListener("TabClose", this);
+ }
+
+ /**
+ * Searches for provider information for a given url.
+ *
+ * @param {string} url The url to match for a provider.
+ * @returns {Array | null} Returns an array of provider name and the provider information.
+ */
+ _getProviderInfoForURL(url) {
+ return this._searchProviderInfo.find(info =>
+ info.searchPageRegexp.test(url)
+ );
+ }
+
+ /**
+ * Checks to see if a url is a search partner location, and determines the
+ * provider and codes used.
+ *
+ * @param {string} url The url to match.
+ * @returns {null|object} Returns null if there is no match found. Otherwise,
+ * returns an object of strings for provider, code and type.
+ */
+ _checkURLForSerpMatch(url) {
+ let searchProviderInfo = this._getProviderInfoForURL(url);
+ if (!searchProviderInfo) {
+ return null;
+ }
+
+ let queries = new URLSearchParams(url.split("#")[0].split("?")[1]);
+
+ let isSPA = !!searchProviderInfo.isSPA;
+ if (isSPA) {
+ // A URL may have a specific query parameter denoting a search page.
+ // If the key was expected but doesn't currently exist, it could be due to
+ // the initial url containing it until after a page load.
+ // In that case, ignore this check since most SERPs missing the query
+ // param will go to the default search page.
+ let { key, value } = searchProviderInfo.defaultPageQueryParam;
+ if (key && queries.has(key) && queries.get(key) != value) {
+ return null;
+ }
+ }
+
+ // Some URLs can match provider info but also be the provider's homepage
+ // instead of a SERP.
+ // e.g. https://example.com/ vs. https://example.com/?foo=bar
+ // Look for the presence of the query parameter that contains a search term.
+ let hasQuery = false;
+ let searchQuery = "";
+ for (let queryParamName of searchProviderInfo.queryParamNames) {
+ searchQuery = queries.get(queryParamName);
+ if (searchQuery) {
+ hasQuery = true;
+ break;
+ }
+ }
+ if (!hasQuery) {
+ return null;
+ }
+ // Default to organic to simplify things.
+ // We override type in the sap cases.
+ let type = "organic";
+ let code;
+ if (searchProviderInfo.codeParamName) {
+ code = queries.get(searchProviderInfo.codeParamName);
+ if (code) {
+ // The code is only included if it matches one of the specific ones.
+ if (searchProviderInfo.taggedCodes.includes(code)) {
+ type = "tagged";
+ if (
+ searchProviderInfo.followOnParamNames &&
+ searchProviderInfo.followOnParamNames.some(p => queries.has(p))
+ ) {
+ type += "-follow-on";
+ }
+ } else if (searchProviderInfo.organicCodes.includes(code)) {
+ type = "organic";
+ } else if (searchProviderInfo.expectedOrganicCodes?.includes(code)) {
+ code = "none";
+ } else {
+ code = "other";
+ }
+ } else if (searchProviderInfo.followOnCookies) {
+ // Especially Bing requires lots of extra work related to cookies.
+ for (let followOnCookie of searchProviderInfo.followOnCookies) {
+ if (followOnCookie.extraCodeParamName) {
+ let eCode = queries.get(followOnCookie.extraCodeParamName);
+ if (
+ !eCode ||
+ !followOnCookie.extraCodePrefixes.some(p => eCode.startsWith(p))
+ ) {
+ continue;
+ }
+ }
+
+ // If this cookie is present, it's probably an SAP follow-on.
+ // This might be an organic follow-on in the same session, but there
+ // is no way to tell the difference.
+ for (let cookie of Services.cookies.getCookiesFromHost(
+ followOnCookie.host,
+ {}
+ )) {
+ if (cookie.name != followOnCookie.name) {
+ continue;
+ }
+
+ let [cookieParam, cookieValue] = cookie.value
+ .split("=")
+ .map(p => p.trim());
+ if (
+ cookieParam == followOnCookie.codeParamName &&
+ searchProviderInfo.taggedCodes.includes(cookieValue)
+ ) {
+ type = "tagged-follow-on";
+ code = cookieValue;
+ break;
+ }
+ }
+ }
+ }
+ }
+ let isShoppingPage = false;
+ let hasComponents = false;
+ if (lazy.serpEventsEnabled) {
+ if (searchProviderInfo.shoppingTab?.regexp) {
+ isShoppingPage = searchProviderInfo.shoppingTab.regexp.test(url);
+ }
+ if (searchProviderInfo.components?.length) {
+ hasComponents = true;
+ }
+ }
+ return {
+ provider: searchProviderInfo.telemetryId,
+ type,
+ code,
+ isShoppingPage,
+ hasComponents,
+ searchQuery,
+ isSPA,
+ };
+ }
+
+ /**
+ * Logs telemetry for a search provider visit.
+ *
+ * @param {object} info The search provider information.
+ * @param {string} info.provider The name of the provider.
+ * @param {string} info.type The type of search.
+ * @param {string} [info.code] The code for the provider.
+ * @param {string} source Where the search originated from.
+ * @param {string} url The url that was matched (for debug logging only).
+ */
+ _reportSerpPage(info, source, url) {
+ let payload = `${info.provider}:${info.type}:${info.code || "none"}`;
+ Services.telemetry.keyedScalarAdd(
+ SEARCH_CONTENT_SCALAR_BASE + source,
+ payload,
+ 1
+ );
+ lazy.logConsole.debug("Impression:", payload, url);
+ }
+}
+
+/**
+ * ContentHandler deals with handling telemetry of the content within a tab -
+ * when ads detected and when they are selected.
+ */
+class ContentHandler {
+ /**
+ * Constructor.
+ *
+ * @param {object} options
+ * The options for the handler.
+ * @param {Map} options.browserInfoByURL
+ * The map of urls from TelemetryHandler.
+ * @param {Function} options.getProviderInfoForURL
+ * A function that obtains the provider information for a url.
+ */
+ constructor(options) {
+ this._browserInfoByURL = options.browserInfoByURL;
+ this._findBrowserItemForURL = options.findBrowserItemForURL;
+ this._checkURLForSerpMatch = options.checkURLForSerpMatch;
+ this._findItemForBrowser = options.findItemForBrowser;
+ }
+
+ /**
+ * Initializes the content handler. This will also set up the shared data that is
+ * shared with the SearchTelemetryChild actor.
+ *
+ * @param {Array} providerInfo
+ * The provider information for the search telemetry to record.
+ */
+ init(providerInfo) {
+ Services.ppmm.sharedData.set(
+ SEARCH_TELEMETRY_SHARED.PROVIDER_INFO,
+ providerInfo
+ );
+ Services.ppmm.sharedData.set(
+ SEARCH_TELEMETRY_SHARED.LOAD_TIMEOUT,
+ ADLINK_CHECK_TIMEOUT_MS
+ );
+ Services.ppmm.sharedData.set(
+ SEARCH_TELEMETRY_SHARED.SPA_LOAD_TIMEOUT,
+ SPA_ADLINK_CHECK_TIMEOUT_MS
+ );
+
+ Services.obs.addObserver(this, "http-on-examine-response");
+ Services.obs.addObserver(this, "http-on-examine-cached-response");
+ Services.obs.addObserver(this, "http-on-stop-request");
+ }
+
+ /**
+ * Uninitializes the content handler.
+ */
+ uninit() {
+ Services.obs.removeObserver(this, "http-on-examine-response");
+ Services.obs.removeObserver(this, "http-on-examine-cached-response");
+ Services.obs.removeObserver(this, "http-on-stop-request");
+ }
+
+ /**
+ * Test-only function to override the search provider information for use
+ * with tests. Passes it to the SearchTelemetryChild actor.
+ *
+ * @param {object} providerInfo @see SEARCH_PROVIDER_INFO for type information.
+ */
+ overrideSearchTelemetryForTests(providerInfo) {
+ Services.ppmm.sharedData.set("SearchTelemetry:ProviderInfo", providerInfo);
+ }
+
+ /**
+ * Reports bandwidth used by the given channel if it is used by search requests.
+ *
+ * @param {object} aChannel The channel that generated the activity.
+ */
+ _reportChannelBandwidth(aChannel) {
+ if (!(aChannel instanceof Ci.nsIChannel)) {
+ return;
+ }
+ let wrappedChannel = ChannelWrapper.get(aChannel);
+
+ let getTopURL = channel => {
+ // top-level document
+ if (
+ channel.loadInfo &&
+ channel.loadInfo.externalContentPolicyType ==
+ Ci.nsIContentPolicy.TYPE_DOCUMENT
+ ) {
+ return channel.finalURL;
+ }
+
+ // iframe
+ let frameAncestors;
+ try {
+ frameAncestors = channel.frameAncestors;
+ } catch (e) {
+ frameAncestors = null;
+ }
+ if (frameAncestors) {
+ let ancestor = frameAncestors.find(obj => obj.frameId == 0);
+ if (ancestor) {
+ return ancestor.url;
+ }
+ }
+
+ // top-level resource
+ if (channel.loadInfo && channel.loadInfo.loadingPrincipal) {
+ return channel.loadInfo.loadingPrincipal.spec;
+ }
+
+ return null;
+ };
+
+ let topUrl = getTopURL(wrappedChannel);
+ if (!topUrl) {
+ return;
+ }
+
+ let info = this._checkURLForSerpMatch(topUrl);
+ if (!info) {
+ return;
+ }
+
+ let bytesTransferred =
+ wrappedChannel.requestSize + wrappedChannel.responseSize;
+ let { provider } = info;
+
+ let isPrivate =
+ wrappedChannel.loadInfo &&
+ wrappedChannel.loadInfo.originAttributes.privateBrowsingId > 0;
+ if (isPrivate) {
+ provider += `-${SEARCH_TELEMETRY_PRIVATE_BROWSING_KEY_SUFFIX}`;
+ }
+
+ Services.telemetry.keyedScalarAdd(
+ SEARCH_DATA_TRANSFERRED_SCALAR,
+ provider,
+ bytesTransferred
+ );
+ }
+
+ observe(aSubject, aTopic, aData) {
+ switch (aTopic) {
+ case "http-on-stop-request":
+ this._reportChannelBandwidth(aSubject);
+ break;
+ case "http-on-examine-response":
+ case "http-on-examine-cached-response":
+ this.observeActivity(aSubject);
+ break;
+ }
+ }
+
+ /**
+ * Listener that observes network activity, so that we can determine if a link
+ * from a search provider page was followed, and if then if that link was an
+ * ad click or not.
+ *
+ * @param {nsIChannel} channel The channel that generated the activity.
+ */
+ observeActivity(channel) {
+ if (!(channel instanceof Ci.nsIChannel)) {
+ return;
+ }
+
+ let wrappedChannel = ChannelWrapper.get(channel);
+ // The channel we're observing might be a redirect of a channel we've
+ // observed before.
+ if (wrappedChannel._adClickRecorded) {
+ lazy.logConsole.debug("Ad click already recorded");
+ return;
+ }
+
+ Services.tm.dispatchToMainThread(() => {
+ // We suspect that No Content (204) responses are used to transfer or
+ // update beacons. They used to lead to double-counting ad-clicks, so let's
+ // ignore them.
+ if (wrappedChannel.statusCode == 204) {
+ lazy.logConsole.debug("Ignoring activity from ambiguous responses");
+ return;
+ }
+
+ // The wrapper is consistent across redirects, so we can use it to track state.
+ let originURL = wrappedChannel.originURI && wrappedChannel.originURI.spec;
+ let item = this._findBrowserItemForURL(originURL);
+ if (!originURL || !item) {
+ return;
+ }
+
+ let url = wrappedChannel.finalURL;
+
+ let providerInfo = item.info.provider;
+ let info = this._searchProviderInfo.find(provider => {
+ return provider.telemetryId == providerInfo;
+ });
+
+ // If an error occurs with Glean SERP telemetry logic, avoid
+ // disrupting legacy telemetry.
+ try {
+ this.#maybeRecordSERPTelemetry(wrappedChannel, item, info);
+ } catch (ex) {
+ lazy.logConsole.error(ex);
+ }
+
+ if (!info.extraAdServersRegexps?.some(regex => regex.test(url))) {
+ return;
+ }
+
+ try {
+ Services.telemetry.keyedScalarAdd(
+ SEARCH_AD_CLICKS_SCALAR_BASE + item.source,
+ `${info.telemetryId}:${item.info.type}`,
+ 1
+ );
+ wrappedChannel._adClickRecorded = true;
+ if (item.newtabSessionId) {
+ Glean.newtabSearchAd.click.record({
+ newtab_visit_id: item.newtabSessionId,
+ search_access_point: item.source,
+ is_follow_on: item.info.type.endsWith("follow-on"),
+ is_tagged: item.info.type.startsWith("tagged"),
+ telemetry_id: item.info.provider,
+ });
+ }
+
+ lazy.logConsole.debug("Counting ad click in page for:", {
+ source: item.source,
+ originURL,
+ URL: url,
+ });
+ } catch (e) {
+ console.error(e);
+ }
+ });
+ }
+
+ /**
+ * Checks if a request should record an ad click if it can be traced to a
+ * browser containing an observed SERP.
+ *
+ * @param {ChannelWrapper} wrappedChannel
+ * The wrapped channel.
+ * @param {object} item
+ * The browser item associated with the origin URL of the request.
+ * @param {object} info
+ * The search provider info associated with the item.
+ */
+ #maybeRecordSERPTelemetry(wrappedChannel, item, info) {
+ if (!lazy.serpEventsEnabled) {
+ return;
+ }
+
+ if (wrappedChannel._recordedClick) {
+ lazy.logConsole.debug("Click already recorded.");
+ return;
+ }
+
+ let originURL = wrappedChannel.originURI?.spec;
+ let url = wrappedChannel.finalURL;
+ // Some channels re-direct by loading pages that return 200. The result
+ // is the channel will have an originURL that changes from the SERP to
+ // either a nonAdsRegexp or an extraAdServersRegexps. This is typical
+ // for loading a page in a new tab. The channel will have changed so any
+ // properties attached to them to record state (e.g. _recordedClick)
+ // won't be present.
+ if (
+ info.nonAdsLinkRegexps.some(r => r.test(originURL)) ||
+ info.extraAdServersRegexps.some(r => r.test(originURL))
+ ) {
+ return;
+ }
+
+ // A click event is recorded if a user loads a resource from an
+ // originURL that is a SERP.
+ //
+ // Typically, we only want top level loads containing documents to avoid
+ // recording any event on an in-page resource a SERP might load
+ // (e.g. CSS files).
+ //
+ // The exception to this is if a subframe loads a resource that matches
+ // a non ad link. Some SERPs encode non ad search results with a URL
+ // that gets loaded into an iframe, which then tells the container of
+ // the iframe to change the location of the page.
+ if (
+ wrappedChannel.channel.isDocument &&
+ (wrappedChannel.channel.loadInfo.isTopLevelLoad ||
+ info.nonAdsLinkRegexps.some(r => r.test(url)))
+ ) {
+ let browser = wrappedChannel.browserElement;
+
+ // If the load is from history, don't record an event.
+ if (
+ browser?.browsingContext.webProgress?.loadType &
+ Ci.nsIDocShell.LOAD_CMD_HISTORY
+ ) {
+ lazy.logConsole.debug("Ignoring load from history");
+ return;
+ }
+
+ // Step 1: Check if the browser associated with the request was a
+ // tracked SERP.
+ let start = Cu.now();
+ let telemetryState;
+ let isFromNewtab = false;
+ if (item.browserTelemetryStateMap.has(browser)) {
+ // If the map contains the browser, then it means that the request is
+ // the SERP is going from one page to another. We know this because
+ // previous conditions prevent non-top level loads from occuring here.
+ telemetryState = item.browserTelemetryStateMap.get(browser);
+ } else if (browser) {
+ // Alternatively, it could be the case that the request is occuring in
+ // a new tab but was triggered by one of the browsers in the state map.
+ // If only one browser exists in the state map, it must be that one.
+ if (item.count === 1) {
+ let sourceBrowsers = ChromeUtils.nondeterministicGetWeakMapKeys(
+ item.browserTelemetryStateMap
+ );
+ if (sourceBrowsers?.length) {
+ telemetryState = item.browserTelemetryStateMap.get(
+ sourceBrowsers[0]
+ );
+ }
+ } else if (item.count > 1) {
+ // If the count is more than 1, then multiple open SERPs contain the
+ // same search term, so try to find the specific browser that opened
+ // the request.
+ let tabBrowser = browser.getTabBrowser();
+ let tab = tabBrowser.getTabForBrowser(browser).openerTab;
+ // A tab will not always have an openerTab, as first tabs in new
+ // windows don't have an openerTab.
+ // Bug 1867582: We should also handle the case where multiple tabs
+ // contain the same search term.
+ if (tab) {
+ telemetryState = item.browserTelemetryStateMap.get(
+ tab.linkedBrowser
+ );
+ }
+ }
+ if (telemetryState) {
+ isFromNewtab = true;
+ }
+ }
+
+ // Step 2: If we have telemetryState, the browser object must be
+ // associated with another browser that is tracked. Try to find the
+ // component type on the SERP responsible for the request.
+ // Exceptions:
+ // - If a searchbox was used to initiate the load, don't record another
+ // engagement because the event was logged elsewhere.
+ // - If the ad impression hasn't been recorded yet, we have no way of
+ // knowing precisely what kind of component was selected.
+ let isSerp = false;
+ if (
+ telemetryState &&
+ telemetryState.adImpressionsReported &&
+ !telemetryState.searchBoxSubmitted
+ ) {
+ if (info.searchPageRegexp?.test(originURL)) {
+ isSerp = true;
+ }
+
+ let startFindComponent = Cu.now();
+ let parsedUrl = new URL(url);
+ // Determine the component type of the link.
+ let type;
+ for (let [
+ storedUrl,
+ componentType,
+ ] of telemetryState.urlToComponentMap.entries()) {
+ // The URL we're navigating to may have more query parameters if
+ // the provider adds query parameters when the user clicks on a link.
+ // On the other hand, the URL we are navigating to may have have
+ // fewer query parameters because of query param stripping.
+ // Thus, if a query parameter is missing, a match can still be made
+ // provided keys that exist in both URLs contain equal values.
+ let score = SearchSERPTelemetry.compareUrls(storedUrl, parsedUrl, {
+ paramValues: true,
+ path: true,
+ });
+ if (score) {
+ type = componentType;
+ break;
+ }
+ }
+ ChromeUtils.addProfilerMarker(
+ "SearchSERPTelemetry._observeActivity",
+ startFindComponent,
+ "Find component for URL"
+ );
+
+ // Default value for URLs that don't match any components categorized
+ // on the page.
+ if (!type) {
+ type = SearchSERPTelemetryUtils.COMPONENTS.NON_ADS_LINK;
+ }
+
+ if (
+ type == SearchSERPTelemetryUtils.COMPONENTS.REFINED_SEARCH_BUTTONS
+ ) {
+ SearchSERPTelemetry.setBrowserContentSource(
+ browser,
+ SearchSERPTelemetryUtils.INCONTENT_SOURCES.REFINE_ON_SERP
+ );
+ } else if (isSerp && isFromNewtab) {
+ SearchSERPTelemetry.setBrowserContentSource(
+ browser,
+ SearchSERPTelemetryUtils.INCONTENT_SOURCES.OPENED_IN_NEW_TAB
+ );
+ }
+
+ // Step 3: Record the engagement.
+ impressionIdsWithoutEngagementsSet.delete(telemetryState.impressionId);
+ if (AD_COMPONENTS.includes(type)) {
+ telemetryState.adsClicked += 1;
+ }
+ Glean.serp.engagement.record({
+ impression_id: telemetryState.impressionId,
+ action: SearchSERPTelemetryUtils.ACTIONS.CLICKED,
+ target: type,
+ });
+ lazy.logConsole.debug("Counting click:", {
+ impressionId: telemetryState.impressionId,
+ type,
+ URL: url,
+ });
+ // Prevent re-directed channels from being examined more than once.
+ wrappedChannel._recordedClick = true;
+ }
+ ChromeUtils.addProfilerMarker(
+ "SearchSERPTelemetry._observeActivity",
+ start,
+ "Maybe record user engagement."
+ );
+ }
+ }
+
+ /**
+ * Logs telemetry for a page with adverts, if it is one of the partner search
+ * provider pages that we're tracking.
+ *
+ * @param {object} info
+ * The search provider information for the page.
+ * @param {boolean} info.hasAds
+ * Whether or not the page has adverts.
+ * @param {string} info.url
+ * The url of the page.
+ * @param {object} browser
+ * The browser associated with the page.
+ */
+ _reportPageWithAds(info, browser) {
+ let item = this._findItemForBrowser(browser);
+ if (!item) {
+ lazy.logConsole.warn(
+ "Expected to report URI for",
+ info.url,
+ "with ads but couldn't find the information"
+ );
+ return;
+ }
+
+ let telemetryState = item.browserTelemetryStateMap.get(browser);
+ if (telemetryState.adsReported) {
+ lazy.logConsole.debug(
+ "Ad was previously reported for browser with URI",
+ info.url
+ );
+ return;
+ }
+
+ lazy.logConsole.debug(
+ "Counting ads in page for",
+ item.info.provider,
+ item.info.type,
+ item.source,
+ info.url
+ );
+ Services.telemetry.keyedScalarAdd(
+ SEARCH_WITH_ADS_SCALAR_BASE + item.source,
+ `${item.info.provider}:${item.info.type}`,
+ 1
+ );
+ Services.obs.notifyObservers(null, "reported-page-with-ads");
+
+ telemetryState.adsReported = true;
+
+ if (item.newtabSessionId) {
+ Glean.newtabSearchAd.impression.record({
+ newtab_visit_id: item.newtabSessionId,
+ search_access_point: item.source,
+ is_follow_on: item.info.type.endsWith("follow-on"),
+ is_tagged: item.info.type.startsWith("tagged"),
+ telemetry_id: item.info.provider,
+ });
+ }
+ }
+
+ /**
+ * Logs ad impression telemetry for a page with adverts, if it is
+ * one of the partner search provider pages that we're tracking.
+ *
+ * @param {object} info
+ * The search provider information for the page.
+ * @param {string} info.url
+ * The url of the page.
+ * @param {Map<string, object>} info.adImpressions
+ * A map of ad impressions found for the page, where the key
+ * is the type of ad component and the value is an object
+ * containing the number of ads that were loaded, visible,
+ * and hidden.
+ * @param {Map<string, string>} info.hrefToComponentMap
+ * A map of hrefs to their component type. Contains both ads
+ * and non-ads.
+ * @param {object} browser
+ * The browser associated with the page.
+ */
+ _reportPageWithAdImpressions(info, browser) {
+ let item = this._findItemForBrowser(browser);
+ if (!item) {
+ return;
+ }
+ let telemetryState = item.browserTelemetryStateMap.get(browser);
+ if (
+ lazy.serpEventsEnabled &&
+ info.adImpressions &&
+ telemetryState &&
+ !telemetryState.adImpressionsReported
+ ) {
+ for (let [componentType, data] of info.adImpressions.entries()) {
+ telemetryState.adsVisible += data.adsVisible;
+
+ lazy.logConsole.debug("Counting ad:", { type: componentType, ...data });
+ Glean.serp.adImpression.record({
+ impression_id: telemetryState.impressionId,
+ component: componentType,
+ ads_loaded: data.adsLoaded,
+ ads_visible: data.adsVisible,
+ ads_hidden: data.adsHidden,
+ });
+ }
+ // Convert hrefToComponentMap to a urlToComponentMap in order to cache
+ // the query parameters of the href.
+ let urlToComponentMap = new Map();
+ for (let [href, adType] of info.hrefToComponentMap) {
+ urlToComponentMap.set(new URL(href), adType);
+ }
+ telemetryState.urlToComponentMap = urlToComponentMap;
+ telemetryState.adImpressionsReported = true;
+ Services.obs.notifyObservers(null, "reported-page-with-ad-impressions");
+ }
+ }
+
+ /**
+ * Records a page action from a SERP page. Normally, actions are tracked in
+ * parent process by observing network events but some actions are not
+ * possible to detect outside of subscribing to the child process.
+ *
+ * @param {object} info
+ * The search provider infomation for the page.
+ * @param {string} info.type
+ * The component type that was clicked on.
+ * @param {string} info.action
+ * The action taken on the page.
+ * @param {object} browser
+ * The browser associated with the page.
+ */
+ _reportPageAction(info, browser) {
+ let item = this._findItemForBrowser(browser);
+ if (!item) {
+ return;
+ }
+ let telemetryState = item.browserTelemetryStateMap.get(browser);
+ let impressionId = telemetryState?.impressionId;
+ if (info.type && impressionId) {
+ lazy.logConsole.debug(`Recorded page action:`, {
+ impressionId: telemetryState.impressionId,
+ type: info.type,
+ action: info.action,
+ });
+ Glean.serp.engagement.record({
+ impression_id: impressionId,
+ action: info.action,
+ target: info.type,
+ });
+ impressionIdsWithoutEngagementsSet.delete(impressionId);
+ // In-content searches are not be categorized with a type, so they will
+ // not be picked up in the network processes.
+ if (
+ info.type == SearchSERPTelemetryUtils.COMPONENTS.INCONTENT_SEARCHBOX &&
+ info.action == SearchSERPTelemetryUtils.ACTIONS.SUBMITTED
+ ) {
+ telemetryState.searchBoxSubmitted = true;
+ SearchSERPTelemetry.setBrowserContentSource(
+ browser,
+ SearchSERPTelemetryUtils.INCONTENT_SOURCES.SEARCHBOX
+ );
+ }
+ } else {
+ lazy.logConsole.warn(
+ "Expected to report a",
+ info.action,
+ "engagement for",
+ info.url,
+ "but couldn't find an impression id."
+ );
+ }
+ }
+
+ _reportPageImpression(info, browser) {
+ let item = this._findItemForBrowser(browser);
+ let telemetryState = item.browserTelemetryStateMap.get(browser);
+ if (!telemetryState?.impressionInfo) {
+ lazy.logConsole.debug(
+ "Could not find telemetry state or impression info."
+ );
+ return;
+ }
+ let impressionId = telemetryState.impressionId;
+ if (impressionId) {
+ let impressionInfo = telemetryState.impressionInfo;
+ Glean.serp.impression.record({
+ impression_id: impressionId,
+ provider: impressionInfo.provider,
+ tagged: impressionInfo.tagged,
+ partner_code: impressionInfo.partnerCode,
+ source: impressionInfo.source,
+ shopping_tab_displayed: info.shoppingTabDisplayed,
+ is_shopping_page: impressionInfo.isShoppingPage,
+ is_private: impressionInfo.isPrivate,
+ });
+ lazy.logConsole.debug(`Reported Impression:`, {
+ impressionId,
+ ...impressionInfo,
+ shoppingTabDisplayed: info.shoppingTabDisplayed,
+ });
+ Services.obs.notifyObservers(null, "reported-page-with-impression");
+ } else {
+ lazy.logConsole.debug("Could not find an impression id.");
+ }
+ }
+
+ /**
+ * Initiates the categorization and reporting of domains extracted from
+ * SERPs.
+ *
+ * @param {object} info
+ * The search provider infomation for the page.
+ * @param {Set} info.nonAdDomains
+ The non-ad domains extracted from the page.
+ * @param {Set} info.adDomains
+ The ad domains extracted from the page.
+ * @param {object} browser
+ * The browser associated with the page.
+ */
+ _reportPageDomains(info, browser) {
+ let item = this._findItemForBrowser(browser);
+ let telemetryState = item.browserTelemetryStateMap.get(browser);
+ if (lazy.serpEventTelemetryCategorization && telemetryState) {
+ let result = SearchSERPCategorization.maybeCategorizeSERP(
+ info.nonAdDomains,
+ info.adDomains,
+ item.info.provider
+ );
+ if (result) {
+ telemetryState.categorizationInfo = result;
+ let callback = () => {
+ let impressionInfo = telemetryState.impressionInfo;
+ SERPCategorizationRecorder.recordCategorizationTelemetry({
+ ...telemetryState.categorizationInfo,
+ app_version: item.majorVersion,
+ channel: item.channel,
+ region: item.region,
+ partner_code: impressionInfo.partnerCode,
+ provider: impressionInfo.provider,
+ tagged: impressionInfo.tagged,
+ num_ads_clicked: telemetryState.adsClicked,
+ num_ads_visible: telemetryState.adsVisible,
+ });
+ };
+ SearchSERPCategorizationEventScheduler.addCallback(browser, callback);
+ }
+ }
+ Services.obs.notifyObservers(
+ null,
+ "reported-page-with-categorized-domains"
+ );
+ }
+}
+
+/**
+ * @typedef {object} CategorizationResult
+ * @property {string} organic_category
+ * The category for the organic result.
+ * @property {number} organic_num_domains
+ * The number of domains examined to determine the organic category result.
+ * @property {number} organic_num_inconclusive
+ * The number of inconclusive domains when determining the organic result.
+ * @property {number} organic_num_unknown
+ * The number of unknown domains when determining the organic result.
+ * @property {string} sponsored_category
+ * The category for the sponsored result.
+ * @property {number} sponsored_num_domains
+ * The number of domains examined to determine the sponsored category.
+ * @property {number} sponsored_num_inconclusive
+ * The number of inconclusive domains when determining the sponsored category.
+ * @property {number} sponsored_num_unknown
+ * The number of unknown domains when determining the sponsored category.
+ * @property {string} mappings_version
+ * The category mapping version used to determine the categories.
+ */
+
+/**
+ * @typedef {object} CategorizationExtraParams
+ * @property {number} num_ads_clicked
+ * The total number of ads clicked on a SERP.
+ * @property {number} num_ads_visible
+ * The total number of ads visible to the user when categorization occurred.
+ */
+
+/* eslint-disable jsdoc/valid-types */
+/**
+ * @typedef {CategorizationResult & CategorizationExtraParams} RecordCategorizationParameters
+ */
+/* eslint-enable jsdoc/valid-types */
+
+/**
+ * Categorizes SERPs.
+ */
+class SERPCategorizer {
+ /**
+ * Categorizes domains extracted from SERPs. Note that we don't process
+ * domains if the domain-to-categories map is empty (if the client couldn't
+ * download Remote Settings attachments, for example).
+ *
+ * @param {Set} nonAdDomains
+ * Domains from organic results extracted from the page.
+ * @param {Set} adDomains
+ * Domains from ad results extracted from the page.
+ * @param {string} provider
+ * The provider associated with the page.
+ * @returns {CategorizationResult | null}
+ * The final categorization result. Returns null if the map was empty.
+ */
+ maybeCategorizeSERP(nonAdDomains, adDomains, provider) {
+ // Per DS, if the map was empty (e.g. because of a technical issue
+ // downloading the data), we shouldn't report telemetry.
+ // Thus, there is no point attempting to categorize the SERP.
+ if (SearchSERPDomainToCategoriesMap.empty) {
+ return null;
+ }
+ let resultsToReport = {};
+
+ let processedDomains = this.processDomains(nonAdDomains, provider);
+ let results = this.applyCategorizationLogic(processedDomains);
+ resultsToReport.organic_category = results.category;
+ resultsToReport.organic_num_domains = results.num_domains;
+ resultsToReport.organic_num_unknown = results.num_unknown;
+ resultsToReport.organic_num_inconclusive = results.num_inconclusive;
+
+ processedDomains = this.processDomains(adDomains, provider);
+ results = this.applyCategorizationLogic(processedDomains);
+ resultsToReport.sponsored_category = results.category;
+ resultsToReport.sponsored_num_domains = results.num_domains;
+ resultsToReport.sponsored_num_unknown = results.num_unknown;
+ resultsToReport.sponsored_num_inconclusive = results.num_inconclusive;
+
+ resultsToReport.mappings_version = SearchSERPDomainToCategoriesMap.version;
+
+ return resultsToReport;
+ }
+
+ /**
+ * Applies the logic for reducing extracted domains to a single category for
+ * the SERP.
+ *
+ * @param {Set} domains
+ * The domains extracted from the page.
+ * @returns {object} resultsToReport
+ * The final categorization results. Keys are: "category", "num_domains",
+ * "num_unknown" and "num_inconclusive".
+ */
+ applyCategorizationLogic(domains) {
+ let domainInfo = {};
+ let domainsCount = 0;
+ let unknownsCount = 0;
+ let inconclusivesCount = 0;
+
+ // Per a request from Data Science, we need to limit the number of domains
+ // categorized to 10 non-ad domains and 10 ad domains.
+ domains = new Set(
+ [...domains].slice(0, CATEGORIZATION_SETTINGS.MAX_DOMAINS_TO_CATEGORIZE)
+ );
+
+ for (let domain of domains) {
+ domainsCount++;
+
+ let categoryCandidates = SearchSERPDomainToCategoriesMap.get(domain);
+
+ if (!categoryCandidates.length) {
+ unknownsCount++;
+ continue;
+ }
+
+ // Inconclusive domains do not have more than one category candidate.
+ if (
+ categoryCandidates[0].category ==
+ SearchSERPTelemetryUtils.CATEGORIZATION.INCONCLUSIVE
+ ) {
+ inconclusivesCount++;
+ continue;
+ }
+
+ domainInfo[domain] = categoryCandidates;
+ }
+
+ let finalCategory;
+ let topCategories = [];
+ // Determine if all domains were unknown or inconclusive.
+ if (unknownsCount + inconclusivesCount == domainsCount) {
+ finalCategory = SearchSERPTelemetryUtils.CATEGORIZATION.INCONCLUSIVE;
+ } else {
+ let maxScore = CATEGORIZATION_SETTINGS.MINIMUM_SCORE;
+ let rank = CATEGORIZATION_SETTINGS.STARTING_RANK;
+ for (let categoryCandidates of Object.values(domainInfo)) {
+ for (let { category, score } of categoryCandidates) {
+ let adjustedScore = score / Math.log2(rank);
+ if (adjustedScore > maxScore) {
+ maxScore = adjustedScore;
+ topCategories = [category];
+ } else if (adjustedScore == maxScore) {
+ topCategories.push(Number(category));
+ }
+ rank++;
+ }
+ }
+ finalCategory =
+ topCategories.length > 1
+ ? this.#chooseRandomlyFrom(topCategories)
+ : topCategories[0];
+ }
+
+ return {
+ category: finalCategory,
+ num_domains: domainsCount,
+ num_unknown: unknownsCount,
+ num_inconclusive: inconclusivesCount,
+ };
+ }
+
+ /**
+ * Processes raw domains extracted from the SERP into their final form before
+ * categorization.
+ *
+ * @param {Set} domains
+ * The domains extracted from the page.
+ * @param {string} provider
+ * The provider associated with the page.
+ * @returns {Set} processedDomains
+ * The final set of processed domains for a page.
+ */
+ processDomains(domains, provider) {
+ let processedDomains = new Set();
+
+ for (let domain of domains) {
+ // Don't include domains associated with the search provider.
+ if (
+ domain.startsWith(`${provider}.`) ||
+ domain.includes(`.${provider}.`)
+ ) {
+ continue;
+ }
+ let domainWithoutSubdomains = this.#stripDomainOfSubdomains(domain);
+ // We may have come across the same domain twice, once with www. prefixed
+ // and another time without.
+ if (
+ domainWithoutSubdomains &&
+ !processedDomains.has(domainWithoutSubdomains)
+ ) {
+ processedDomains.add(domainWithoutSubdomains);
+ }
+ }
+
+ return processedDomains;
+ }
+
+ /**
+ * Helper to strip domains of any subdomains.
+ *
+ * @param {string} domain
+ * The domain to strip of any subdomains.
+ * @returns {object} browser
+ * The given domain with any subdomains removed.
+ */
+ #stripDomainOfSubdomains(domain) {
+ let tld;
+ // Can throw an exception if the input has too few domain levels.
+ try {
+ tld = Services.eTLD.getKnownPublicSuffixFromHost(domain);
+ } catch (ex) {
+ return "";
+ }
+
+ let domainWithoutTLD = domain.substring(0, domain.length - tld.length);
+ let secondLevelDomain = domainWithoutTLD.split(".").at(-2);
+
+ return secondLevelDomain ? `${secondLevelDomain}.${tld}` : "";
+ }
+
+ #chooseRandomlyFrom(categories) {
+ let randIdx = Math.floor(Math.random() * categories.length);
+ return categories[randIdx];
+ }
+}
+
+/**
+ * Contains outstanding categorizations of browser objects that have yet to be
+ * scheduled to be reported into a Glean event.
+ * They are kept here until one of the following conditions is met:
+ * 1. The browser that was tracked is no longer being tracked.
+ * 2. A user has been idle for IDLE_TIMEOUT_SECONDS
+ * 3. The user has awoken their computer and the time elapsed from the last
+ * categorization event exceeds WAKE_TIMEOUT_MS.
+ */
+class CategorizationEventScheduler {
+ /**
+ * A WeakMap containing browser objects mapped to a callback.
+ * Created by init() and reset to null by uninit(); entries are added by
+ * addCallback() and removed by sendCallback() once the callback has run.
+ *
+ * @type {WeakMap | null}
+ */
+ #browserToCallbackMap = null;
+
+ /**
+ * An instance of user idle service. Cached for testing purposes.
+ * Obtained in init(), where this object is registered as an idle observer,
+ * and cleared again in uninit().
+ *
+ * @type {nsIUserIdleService | null}
+ */
+ #idleService = null;
+
+ /**
+ * Whether it has been initialized. Set at the end of init() and cleared at
+ * the end of uninit(); both methods use it to avoid doing their work twice.
+ *
+ * @type {boolean}
+ */
+ #init = false;
+
+ /**
+ * The last Date.now() of a callback insertion. Reset to null whenever all
+ * callbacks are sent. On a wake notification it is compared against
+ * CATEGORIZATION_SETTINGS.WAKE_TIMEOUT_MS to decide whether to send them.
+ *
+ * @type {number | null}
+ */
+ #mostRecentMs = null;
+
+ constructor() {
+ this.init();
+ }
+
+ init() {
+ if (!lazy.serpEventTelemetryCategorization || this.#init) {
+ return;
+ }
+
+ lazy.logConsole.debug("Initializing categorization event scheduler.");
+
+ this.#browserToCallbackMap = new WeakMap();
+
+ // In tests, we simulate idleness as it is more reliable and easier than
+ // trying to replicate idleness. The way to do is so it by creating
+ // an mock idle service and having the component subscribe to it. If we
+ // used a lazy instantiation of idle service, the test could only ever be
+ // subscribed to the real one.
+ this.#idleService = Cc["@mozilla.org/widget/useridleservice;1"].getService(
+ Ci.nsIUserIdleService
+ );
+
+ this.#idleService.addIdleObserver(
+ this,
+ CATEGORIZATION_SETTINGS.IDLE_TIMEOUT_SECONDS
+ );
+
+ Services.obs.addObserver(this, "quit-application");
+ Services.obs.addObserver(this, "wake_notification");
+
+ this.#init = true;
+ }
+
+ uninit() {
+ if (!this.#init) {
+ return;
+ }
+
+ this.#browserToCallbackMap = null;
+
+ lazy.logConsole.debug("Un-initializing categorization event scheduler.");
+ this.#idleService.removeIdleObserver(
+ this,
+ CATEGORIZATION_SETTINGS.IDLE_TIMEOUT_SECONDS
+ );
+
+ Services.obs.removeObserver(this, "quit-application");
+ Services.obs.removeObserver(this, "wake_notification");
+
+ this.#idleService = null;
+ this.#init = false;
+ }
+
+ observe(subject, topic, data) {
+ switch (topic) {
+ case "idle":
+ lazy.logConsole.debug("Triggering all callbacks due to idle.");
+ this.#sendAllCallbacks();
+ break;
+ case "quit-application":
+ this.uninit();
+ break;
+ case "wake_notification":
+ if (
+ this.#mostRecentMs &&
+ Date.now() - this.#mostRecentMs >=
+ CATEGORIZATION_SETTINGS.WAKE_TIMEOUT_MS
+ ) {
+ lazy.logConsole.debug(
+ "Triggering all callbacks due to a wake notification."
+ );
+ this.#sendAllCallbacks();
+ }
+ break;
+ }
+ }
+
+ addCallback(browser, callback) {
+ lazy.logConsole.debug("Adding callback to queue.");
+ this.#mostRecentMs = Date.now();
+ this.#browserToCallbackMap?.set(browser, callback);
+ }
+
+ sendCallback(browser) {
+ let callback = this.#browserToCallbackMap?.get(browser);
+ if (callback) {
+ lazy.logConsole.debug("Triggering callback.");
+ callback();
+ Services.obs.notifyObservers(
+ null,
+ "recorded-single-categorization-event"
+ );
+ this.#browserToCallbackMap.delete(browser);
+ }
+ }
+
+ #sendAllCallbacks() {
+ let browsers = ChromeUtils.nondeterministicGetWeakMapKeys(
+ this.#browserToCallbackMap
+ );
+ if (browsers) {
+ lazy.logConsole.debug("Triggering all callbacks.");
+ for (let browser of browsers) {
+ this.sendCallback(browser);
+ }
+ }
+ this.#mostRecentMs = null;
+ Services.obs.notifyObservers(null, "recorded-all-categorization-events");
+ }
+}
+
+/**
+ * Handles reporting SERP categorization telemetry to Glean.
+ */
+class CategorizationRecorder {
+ /**
+ * Helper function for recording the SERP categorization event.
+ *
+ * @param {RecordCategorizationParameters} resultToReport
+ * The object containing all the data required to report.
+ */
+ recordCategorizationTelemetry(resultToReport) {
+ lazy.logConsole.debug(
+ "Reporting the following categorization result:",
+ resultToReport
+ );
+ // TODO: Bug 1868476 - Report result to Glean.
+ }
+}
+
+/**
+ * @typedef {object} DomainToCategoriesRecord
+ * @property {number} version
+ * The version of the record.
+ */
+
+/**
+ * @typedef {object} DomainCategoryScore
+ * @property {number} category
+ * The index of the category.
+ * @property {number} score
+ * The score associated with the category.
+ */
+
+/**
+ * Maps domain to categories, with data synced with Remote Settings.
+ */
+class DomainToCategoriesMap {
+ /**
+ * Contains the domain to category scores.
+ *
+ * @type {Object<string, Array<DomainCategoryScore>> | null}
+ */
+ #map = null;
+
+ /**
+ * Latest version number of the attachments.
+ *
+ * @type {number | null}
+ */
+ #version = null;
+
+ /**
+ * The Remote Settings client.
+ *
+ * @type {object | null}
+ */
+ #client = null;
+
+ /**
+ * Whether this is synced with Remote Settings.
+ *
+ * @type {boolean}
+ */
+ #init = false;
+
+ /**
+ * Callback when Remote Settings syncs.
+ *
+ * @type {Function | null}
+ */
+ #onSettingsSync = null;
+
+ /**
+ * When downloading an attachment from Remote Settings fails, this will
+ * contain a timer which will eventually attempt to retry downloading
+ * attachments.
+ */
+ #downloadTimer = null;
+
+ /**
+ * Number of times this has attempted to try another download. Will reset
+ * if the categorization preference has been toggled, or a sync event has
+ * been detected.
+ *
+ * @type {number}
+ */
+ #downloadRetries = 0;
+
+ /**
+ * Runs at application startup with startup idle tasks. If the SERP
+ * categorization preference is enabled, it creates a Remote Settings
+ * client to listen to updates, and populates the map.
+ */
+ async init() {
+ if (!lazy.serpEventTelemetryCategorization || this.#init) {
+ return;
+ }
+ lazy.logConsole.debug("Initializing domain-to-categories map.");
+ this.#setupClientAndMap();
+ this.#init = true;
+ }
+
+ uninit() {
+ if (this.#init) {
+ lazy.logConsole.debug("Un-initializing domain-to-categories map.");
+ this.#clearClientAndMap();
+ this.#cancelAndNullifyTimer();
+ this.#init = false;
+ }
+ }
+
+ /**
+ * Given a domain, find categories and relevant scores.
+ *
+ * @param {string} domain Domain to lookup.
+ * @returns {Array<DomainCategoryScore>}
+ * An array containing categories and their respective score. If no record
+ * for the domain is available, return an empty array.
+ */
+ get(domain) {
+ if (this.empty) {
+ return [];
+ }
+ lazy.gCryptoHash.init(lazy.gCryptoHash.MD5);
+ let bytes = new TextEncoder().encode(domain);
+ lazy.gCryptoHash.update(bytes, domain.length);
+ let hash = lazy.gCryptoHash.finish(true);
+ let rawValues = this.#map[hash] ?? [];
+ if (rawValues.length) {
+ let output = [];
+ // Transform data into a more readable format.
+ // [x, y] => { category: x, score: y }
+ for (let i = 0; i < rawValues.length; i += 2) {
+ output.push({ category: rawValues[i], score: rawValues[i + 1] });
+ }
+ return output;
+ }
+ return [];
+ }
+
+ /**
+ * If the map was initialized, returns the version number for the data.
+ * The version number is determined by the record with the highest version
+ * number. Even if the records have different versions, only records from the
+ * latest version should be available. Returns null if the map was not
+ * initialized.
+ *
+ * @returns {null | number} The version number.
+ */
+ get version() {
+ return this.#version;
+ }
+
+ /**
+ * Whether the map is empty of data.
+ *
+ * @returns {boolean}
+ */
+ get empty() {
+ return !this.#map;
+ }
+
+ /**
+ * Unit test-only function, used to override the domainToCategoriesMap so
+ * that tests can set it to easy to test values.
+ *
+ * @param {object} domainToCategoriesMap
+ * An object where the key is a hashed domain and the value is an array
+ * containing an arbitrary number of DomainCategoryScores.
+ */
+ overrideMapForTests(domainToCategoriesMap) {
+ this.#map = domainToCategoriesMap;
+ }
+
+ async #setupClientAndMap() {
+ if (this.#client && !this.empty) {
+ return;
+ }
+ lazy.logConsole.debug("Setting up domain-to-categories map.");
+ this.#client = lazy.RemoteSettings(TELEMETRY_CATEGORIZATION_KEY);
+
+ this.#onSettingsSync = event => this.#sync(event.data);
+ this.#client.on("sync", this.#onSettingsSync);
+
+ let records = await this.#client.get();
+ await this.#clearAndPopulateMap(records);
+ }
+
+ #clearClientAndMap() {
+ if (this.#client) {
+ lazy.logConsole.debug("Removing Remote Settings client.");
+ this.#client.off("sync", this.#onSettingsSync);
+ this.#client = null;
+ this.#onSettingsSync = null;
+ this.#downloadRetries = 0;
+ }
+
+ if (this.#map) {
+ lazy.logConsole.debug("Clearing domain-to-categories map.");
+ this.#map = null;
+ this.#version = null;
+ }
+ }
+
+ /**
+ * Inspects a list of records from the categorization domain bucket and finds
+ * the maximum version score from the set of records. Each record should have
+ * the same version number but if for any reason one entry has a lower
+ * version number, the latest version can be used to filter it out.
+ *
+ * @param {Array<DomainToCategoriesRecord>} records
+ * An array containing the records from a Remote Settings collection.
+ * @returns {number}
+ */
+ #retrieveLatestVersion(records) {
+ return records.reduce((version, record) => {
+ if (record.version > version) {
+ return record.version;
+ }
+ return version;
+ }, 0);
+ }
+
+ /**
+ * Callback when Remote Settings has indicated the collection has been
+ * synced. Since the records in the collection will be updated all at once,
+ * use the array of current records which at this point in time would have
+ * the latest records from Remote Settings. Additionally, delete any
+ * attachment for records that no longer exist.
+ *
+ * @param {object} data
+ * Object containing records that are current, deleted, created, or updated.
+ *
+ */
+ async #sync(data) {
+ lazy.logConsole.debug("Syncing domain-to-categories with Remote Settings.");
+
+ // Remove local files of deleted records.
+ let toDelete = data?.deleted.filter(d => d.attachment);
+ await Promise.all(
+ toDelete.map(record => this.#client.attachments.deleteDownloaded(record))
+ );
+
+ // In case a user encountered network failures in the past and kept their
+ // session on, this will ensure the next sync event will retry downloading
+ // again in case there's a new download error.
+ this.#downloadRetries = 0;
+
+ this.#clearAndPopulateMap(data?.current);
+ }
+
+ /**
+ * Clear the existing map and populate it with attachments found in the
+ * records. If no attachments are found, or no record containing an
+ * attachment contained the latest version, then nothing will change.
+ *
+ * @param {Array<DomainToCategoriesRecord>} records
+ * The records containing attachments.
+ *
+ */
+ async #clearAndPopulateMap(records) {
+ // Set map to null so that if there are errors in the downloads, consumers
+ // will be able to know whether the map has information. Once we've
+ // successfully downloaded attachments and are parsing them, a non-null
+ // object will be created.
+ this.#map = null;
+ this.#version = null;
+ this.#cancelAndNullifyTimer();
+
+ if (!records?.length) {
+ lazy.logConsole.debug("No records found for domain-to-categories map.");
+ return;
+ }
+
+ let fileContents = [];
+ for (let record of records) {
+ let result;
+ // Downloading attachments can fail.
+ try {
+ result = await this.#client.attachments.download(record);
+ } catch (ex) {
+ lazy.logConsole.error("Could not download file:", ex);
+ this.#createTimerToPopulateMap();
+ return;
+ }
+ fileContents.push(result.buffer);
+ }
+
+ // All attachments should have the same version number. If for whatever
+ // reason they don't, we should only use the attachments with the latest
+ // version.
+ this.#version = this.#retrieveLatestVersion(records);
+
+ if (!this.#version) {
+ lazy.logConsole.debug("Could not find a version number for any record.");
+ return;
+ }
+
+ // Queue the series of assignments.
+ for (let i = 0; i < fileContents.length; ++i) {
+ let buffer = fileContents[i];
+ Services.tm.idleDispatchToMainThread(() => {
+ let start = Cu.now();
+ let json;
+ try {
+ json = JSON.parse(new TextDecoder().decode(buffer));
+ } catch (ex) {
+ // TODO: If there was an error decoding the buffer, we may want to
+ // dispatch an error in telemetry or try again.
+ return;
+ }
+ ChromeUtils.addProfilerMarker(
+ "SearchSERPTelemetry.#clearAndPopulateMap",
+ start,
+ "Convert buffer to JSON."
+ );
+ if (!this.#map) {
+ this.#map = {};
+ }
+ Object.assign(this.#map, json);
+ lazy.logConsole.debug("Updated domain-to-categories map.");
+ if (i == fileContents.length - 1) {
+ Services.obs.notifyObservers(
+ null,
+ "domain-to-categories-map-update-complete"
+ );
+ }
+ });
+ }
+ }
+
+ #cancelAndNullifyTimer() {
+ if (this.#downloadTimer) {
+ lazy.logConsole.debug("Cancel and nullify download timer.");
+ this.#downloadTimer.cancel();
+ this.#downloadTimer = null;
+ }
+ }
+
+ #createTimerToPopulateMap() {
+ if (
+ this.#downloadRetries >=
+ TELEMETRY_CATEGORIZATION_DOWNLOAD_SETTINGS.maxTriesPerSession
+ ) {
+ return;
+ }
+ if (!this.#downloadTimer) {
+ this.#downloadTimer = Cc["@mozilla.org/timer;1"].createInstance(
+ Ci.nsITimer
+ );
+ }
+ lazy.logConsole.debug("Create timer to retry downloading attachments.");
+ let delay =
+ TELEMETRY_CATEGORIZATION_DOWNLOAD_SETTINGS.base +
+ randomInteger(
+ TELEMETRY_CATEGORIZATION_DOWNLOAD_SETTINGS.minAdjust,
+ TELEMETRY_CATEGORIZATION_DOWNLOAD_SETTINGS.maxAdjust
+ );
+ this.#downloadTimer.initWithCallback(
+ async () => {
+ this.#downloadRetries += 1;
+ let records = await this.#client.get();
+ this.#clearAndPopulateMap(records);
+ },
+ delay,
+ Ci.nsITimer.TYPE_ONE_SHOT
+ );
+ }
+}
+
+function randomInteger(min, max) {
+ return Math.floor(Math.random() * (max - min + 1)) + min;
+}
+
+// Module-level singletons. Each instance is constructed once, when this
+// module is evaluated, and exported for use by other modules.
+export var SearchSERPDomainToCategoriesMap = new DomainToCategoriesMap();
+export var SearchSERPTelemetry = new TelemetryHandler();
+export var SearchSERPCategorization = new SERPCategorizer();
+export var SERPCategorizationRecorder = new CategorizationRecorder();
+// The scheduler's constructor calls init(), so constructing it here may
+// register its idle and observer-service listeners immediately.
+export var SearchSERPCategorizationEventScheduler =
+ new CategorizationEventScheduler();