diff options
Diffstat (limited to 'browser/components/search/SearchSERPTelemetry.sys.mjs')
-rw-r--r-- | browser/components/search/SearchSERPTelemetry.sys.mjs | 2515 |
1 files changed, 2515 insertions, 0 deletions
diff --git a/browser/components/search/SearchSERPTelemetry.sys.mjs b/browser/components/search/SearchSERPTelemetry.sys.mjs new file mode 100644 index 0000000000..00105241bb --- /dev/null +++ b/browser/components/search/SearchSERPTelemetry.sys.mjs @@ -0,0 +1,2515 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs"; + +const lazy = {}; + +ChromeUtils.defineESModuleGetters(lazy, { + BrowserSearchTelemetry: "resource:///modules/BrowserSearchTelemetry.sys.mjs", + PrivateBrowsingUtils: "resource://gre/modules/PrivateBrowsingUtils.sys.mjs", + Region: "resource://gre/modules/Region.sys.mjs", + RemoteSettings: "resource://services-settings/remote-settings.sys.mjs", + SearchUtils: "resource://gre/modules/SearchUtils.sys.mjs", +}); + +ChromeUtils.defineLazyGetter(lazy, "gCryptoHash", () => { + return Cc["@mozilla.org/security/hash;1"].createInstance(Ci.nsICryptoHash); +}); + +// The various histograms and scalars that we report to. +const SEARCH_CONTENT_SCALAR_BASE = "browser.search.content."; +const SEARCH_WITH_ADS_SCALAR_BASE = "browser.search.withads."; +const SEARCH_AD_CLICKS_SCALAR_BASE = "browser.search.adclicks."; +const SEARCH_DATA_TRANSFERRED_SCALAR = "browser.search.data_transferred"; +const SEARCH_TELEMETRY_PRIVATE_BROWSING_KEY_SUFFIX = "pb"; + +// Exported for tests. +export const ADLINK_CHECK_TIMEOUT_MS = 1000; +// Unlike the standard adlink check, the timeout for single page apps is not +// based on a content event within the page, like DOMContentLoaded or load. +// Thus, we aim for a longer timeout to account for when the server might be +// slow to update the content on the page. 
export const SPA_ADLINK_CHECK_TIMEOUT_MS = 2500;
// Name passed to lazy.RemoteSettings() in TelemetryHandler.init() to obtain
// the provider information.
export const TELEMETRY_SETTINGS_KEY = "search-telemetry-v2";
export const TELEMETRY_CATEGORIZATION_KEY = "search-categorization";
export const TELEMETRY_CATEGORIZATION_DOWNLOAD_SETTINGS = {
  // Units are in milliseconds.
  base: 3600000,
  minAdjust: 60000,
  maxAdjust: 600000,
  maxTriesPerSession: 2,
};

// Keys used with Services.ppmm.sharedData to hand the provider information
// and the ad link check timeouts over to the content processes.
export const SEARCH_TELEMETRY_SHARED = {
  PROVIDER_INFO: "SearchTelemetry:ProviderInfo",
  LOAD_TIMEOUT: "SearchTelemetry:LoadTimeout",
  SPA_LOAD_TIMEOUT: "SearchTelemetry:SPALoadTimeout",
};

// Impression ids that have been generated for a SERP but for which neither an
// engagement nor an abandonment has been recorded yet. Ids are added when a
// SERP starts being tracked and removed when one of those events is recorded.
const impressionIdsWithoutEngagementsSet = new Set();

export const CATEGORIZATION_SETTINGS = {
  MAX_DOMAINS_TO_CATEGORIZE: 10,
  MINIMUM_SCORE: 0,
  STARTING_RANK: 2,
  IDLE_TIMEOUT_SECONDS: 60 * 60,
  WAKE_TIMEOUT_MS: 60 * 60 * 1000,
};

// Console instance prefixed with "SearchTelemetry". Debug output is only
// emitted when SearchUtils.loggingEnabled is set, otherwise warnings and up.
ChromeUtils.defineLazyGetter(lazy, "logConsole", () => {
  return console.createInstance({
    prefix: "SearchTelemetry",
    maxLogLevel: lazy.SearchUtils.loggingEnabled ? "Debug" : "Warn",
  });
});

// Whether SERP event (Glean) telemetry is enabled; defaults to true.
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "serpEventsEnabled",
  "browser.search.serpEventTelemetry.enabled",
  true
);

const CATEGORIZATION_PREF =
  "browser.search.serpEventTelemetryCategorization.enabled";

// Whether SERP categorization telemetry is enabled; defaults to false. The
// callback initialises or tears down the categorization helpers according to
// the new preference value.
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "serpEventTelemetryCategorization",
  CATEGORIZATION_PREF,
  false,
  (aPreference, previousValue, newValue) => {
    if (newValue) {
      SearchSERPDomainToCategoriesMap.init();
      SearchSERPCategorizationEventScheduler.init();
    } else {
      SearchSERPDomainToCategoriesMap.uninit();
      SearchSERPCategorizationEventScheduler.uninit();
    }
  }
);

// String constants shared by the SERP telemetry code: engagement actions,
// page component types, abandonment reasons and in-content sources.
export const SearchSERPTelemetryUtils = {
  ACTIONS: {
    CLICKED: "clicked",
    EXPANDED: "expanded",
    SUBMITTED: "submitted",
  },
  COMPONENTS: {
    AD_CAROUSEL: "ad_carousel",
    AD_IMAGE_ROW: "ad_image_row",
    AD_LINK: "ad_link",
    AD_SIDEBAR: "ad_sidebar",
    AD_SITELINK: "ad_sitelink",
    INCONTENT_SEARCHBOX: "incontent_searchbox",
    NON_ADS_LINK: "non_ads_link",
    REFINED_SEARCH_BUTTONS: "refined_search_buttons",
    SHOPPING_TAB: "shopping_tab",
  },
  ABANDONMENTS: {
    NAVIGATION: "navigation",
    TAB_CLOSE: "tab_close",
    WINDOW_CLOSE: "window_close",
  },
  INCONTENT_SOURCES: {
    OPENED_IN_NEW_TAB: "opened_in_new_tab",
    REFINE_ON_SERP: "follow_on_from_refine_on_SERP",
    SEARCHBOX: "follow_on_from_refine_on_incontent_search",
  },
  CATEGORIZATION: {
    INCONCLUSIVE: 0,
  },
};

// The subset of SearchSERPTelemetryUtils.COMPONENTS that are ad components.
const AD_COMPONENTS = [
  SearchSERPTelemetryUtils.COMPONENTS.AD_CAROUSEL,
  SearchSERPTelemetryUtils.COMPONENTS.AD_IMAGE_ROW,
  SearchSERPTelemetryUtils.COMPONENTS.AD_LINK,
  SearchSERPTelemetryUtils.COMPONENTS.AD_SIDEBAR,
  SearchSERPTelemetryUtils.COMPONENTS.AD_SITELINK,
];

/**
 * TelemetryHandler is the main class handling Search Engine Result Page (SERP)
 * telemetry. It primarily deals with tracking of what pages are loaded into tabs.
 *
 * It handles the *in-content:sap* keys of the SEARCH_COUNTS histogram.
 */
class TelemetryHandler {
  // Whether or not this class is initialised.
  _initialized = false;

  // An instance of ContentHandler.
  _contentHandler;

  // The original provider information, mainly used for tests.
  _originalProviderInfo = null;

  // The current search provider info.
  _searchProviderInfo = null;

  // An instance of remote settings that is used to access the provider info.
  _telemetrySettings;

  // Callback used when syncing telemetry settings.
  #telemetrySettingsSync;

  // _browserInfoByURL is a map of tracked search urls to objects containing:
  // * {object} info
  //   the search provider information associated with the url.
  // * {WeakMap} browserTelemetryStateMap
  //   a weak map of browsers that have the url loaded, their ad report state,
  //   and their impression id.
  // * {integer} count
  //   a manual count of browsers logged.
+ // We keep a weak map of browsers, in case we miss something on our counts + // and cause a memory leak - worst case our map is slightly bigger than it + // needs to be. + // The manual count is because WeakMap doesn't give us size/length + // information, but we want to know when we can clean up our associated + // entry. + _browserInfoByURL = new Map(); + + // Browser objects mapped to the info in _browserInfoByURL. + #browserToItemMap = new WeakMap(); + + // _browserSourceMap is a map of the latest search source for a particular + // browser - one of the KNOWN_SEARCH_SOURCES in BrowserSearchTelemetry. + _browserSourceMap = new WeakMap(); + + /** + * A WeakMap whose key is a browser with value of a source type found in + * INCONTENT_SOURCES. Kept separate to avoid overlapping with legacy + * search sources. These sources are specific to the content of a search + * provider page rather than something from within the browser itself. + */ + #browserContentSourceMap = new WeakMap(); + + /** + * Sets the source of a SERP visit from something that occured in content + * rather than from the browser. + * + * @param {browser} browser + * The browser object associated with the page that should be a SERP. + * @param {string} source + * The source that started the load. One of + * SearchSERPTelemetryUtils.COMPONENTS.INCONTENT_SEARCHBOX, + * SearchSERPTelemetryUtils.INCONTENT_SOURCES.OPENED_IN_NEW_TAB or + * SearchSERPTelemetryUtils.INCONTENT_SOURCES.REFINE_ON_SERP. + */ + setBrowserContentSource(browser, source) { + this.#browserContentSourceMap.set(browser, source); + } + + // _browserNewtabSessionMap is a map of the newtab session id for particular + // browsers. 
+ _browserNewtabSessionMap = new WeakMap(); + + constructor() { + this._contentHandler = new ContentHandler({ + browserInfoByURL: this._browserInfoByURL, + findBrowserItemForURL: (...args) => this._findBrowserItemForURL(...args), + checkURLForSerpMatch: (...args) => this._checkURLForSerpMatch(...args), + findItemForBrowser: (...args) => this.findItemForBrowser(...args), + }); + } + + /** + * Initializes the TelemetryHandler and its ContentHandler. It will add + * appropriate listeners to the window so that window opening and closing + * can be tracked. + */ + async init() { + if (this._initialized) { + return; + } + + this._telemetrySettings = lazy.RemoteSettings(TELEMETRY_SETTINGS_KEY); + let rawProviderInfo = []; + try { + rawProviderInfo = await this._telemetrySettings.get(); + } catch (ex) { + lazy.logConsole.error("Could not get settings:", ex); + } + + this.#telemetrySettingsSync = event => this.#onSettingsSync(event); + this._telemetrySettings.on("sync", this.#telemetrySettingsSync); + + // Send the provider info to the child handler. + this._contentHandler.init(rawProviderInfo); + this._originalProviderInfo = rawProviderInfo; + + // Now convert the regexps into + this._setSearchProviderInfo(rawProviderInfo); + + for (let win of Services.wm.getEnumerator("navigator:browser")) { + this._registerWindow(win); + } + Services.wm.addListener(this); + + this._initialized = true; + } + + async #onSettingsSync(event) { + let current = event.data?.current; + if (current) { + lazy.logConsole.debug( + "Update provider info due to Remote Settings sync." + ); + this._originalProviderInfo = current; + this._setSearchProviderInfo(current); + Services.ppmm.sharedData.set( + SEARCH_TELEMETRY_SHARED.PROVIDER_INFO, + current + ); + Services.ppmm.sharedData.flush(); + } else { + lazy.logConsole.debug( + "Ignoring Remote Settings sync data due to missing records." 
+ ); + } + Services.obs.notifyObservers(null, "search-telemetry-v2-synced"); + } + + /** + * Uninitializes the TelemetryHandler and its ContentHandler. + */ + uninit() { + if (!this._initialized) { + return; + } + + this._contentHandler.uninit(); + + for (let win of Services.wm.getEnumerator("navigator:browser")) { + this._unregisterWindow(win); + } + Services.wm.removeListener(this); + + try { + this._telemetrySettings.off("sync", this.#telemetrySettingsSync); + } catch (ex) { + lazy.logConsole.error( + "Failed to shutdown SearchSERPTelemetry Remote Settings.", + ex + ); + } + this._telemetrySettings = null; + this.#telemetrySettingsSync = null; + + this._initialized = false; + } + + /** + * Records the search source for particular browsers, in case it needs + * to be associated with a SERP. + * + * @param {browser} browser + * The browser where the search originated. + * @param {string} source + * Where the search originated from. + */ + recordBrowserSource(browser, source) { + this._browserSourceMap.set(browser, source); + } + + /** + * Records the newtab source for particular browsers, in case it needs + * to be associated with a SERP. + * + * @param {browser} browser + * The browser where the search originated. + * @param {string} newtabSessionId + * The sessionId of the newtab session the search originated from. + */ + recordBrowserNewtabSession(browser, newtabSessionId) { + this._browserNewtabSessionMap.set(browser, newtabSessionId); + } + + /** + * Helper function for recording the reason for a Glean abandonment event. + * + * @param {string} impressionId + * The impression id for the abandonment event about to be recorded. + * @param {string} reason + * The reason the SERP is deemed abandoned. + * One of SearchSERPTelemetryUtils.ABANDONMENTS. 
+ */ + recordAbandonmentTelemetry(impressionId, reason) { + impressionIdsWithoutEngagementsSet.delete(impressionId); + + lazy.logConsole.debug( + `Recording an abandonment event for impression id ${impressionId} with reason: ${reason}` + ); + + Glean.serp.abandonment.record({ + impression_id: impressionId, + reason, + }); + } + + /** + * Handles the TabClose event received from the listeners. + * + * @param {object} event + * The event object provided by the listener. + */ + handleEvent(event) { + if (event.type != "TabClose") { + console.error("Received unexpected event type", event.type); + return; + } + + this._browserNewtabSessionMap.delete(event.target.linkedBrowser); + this.stopTrackingBrowser( + event.target.linkedBrowser, + SearchSERPTelemetryUtils.ABANDONMENTS.TAB_CLOSE + ); + } + + /** + * Test-only function, used to override the provider information, so that + * unit tests can set it to easy to test values. + * + * @param {Array} providerInfo + * See {@link https://searchfox.org/mozilla-central/search?q=search-telemetry-schema.json} + * for type information. + */ + overrideSearchTelemetryForTests(providerInfo) { + let info = providerInfo ? providerInfo : this._originalProviderInfo; + this._contentHandler.overrideSearchTelemetryForTests(info); + this._setSearchProviderInfo(info); + } + + /** + * Used to set the local version of the search provider information. + * This automatically maps the regexps to RegExp objects so that + * we don't have to create a new instance each time. + * + * @param {Array} providerInfo + * A raw array of provider information to set. 
+ */ + _setSearchProviderInfo(providerInfo) { + this._searchProviderInfo = providerInfo.map(provider => { + let newProvider = { + ...provider, + searchPageRegexp: new RegExp(provider.searchPageRegexp), + }; + if (provider.extraAdServersRegexps) { + newProvider.extraAdServersRegexps = provider.extraAdServersRegexps.map( + r => new RegExp(r) + ); + } + + newProvider.nonAdsLinkRegexps = provider.nonAdsLinkRegexps?.length + ? provider.nonAdsLinkRegexps.map(r => new RegExp(r)) + : []; + if (provider.shoppingTab?.regexp) { + newProvider.shoppingTab = { + selector: provider.shoppingTab.selector, + regexp: new RegExp(provider.shoppingTab.regexp), + }; + } + return newProvider; + }); + this._contentHandler._searchProviderInfo = this._searchProviderInfo; + } + + reportPageAction(info, browser) { + this._contentHandler._reportPageAction(info, browser); + } + + reportPageWithAds(info, browser) { + this._contentHandler._reportPageWithAds(info, browser); + } + + reportPageWithAdImpressions(info, browser) { + this._contentHandler._reportPageWithAdImpressions(info, browser); + } + + reportPageDomains(info, browser) { + this._contentHandler._reportPageDomains(info, browser); + } + + reportPageImpression(info, browser) { + this._contentHandler._reportPageImpression(info, browser); + } + + /** + * This may start tracking a tab based on the URL. If the URL matches a search + * partner, and it has a code, then we'll start tracking it. This will aid + * determining if it is a page we should be tracking for adverts. + * + * @param {object} browser + * The browser associated with the page. + * @param {string} url + * The url that was loaded in the browser. + * @param {nsIDocShell.LoadCommand} loadType + * The load type associated with the page load. 
+ */ + updateTrackingStatus(browser, url, loadType) { + if ( + !lazy.BrowserSearchTelemetry.shouldRecordSearchCount( + browser.getTabBrowser() + ) + ) { + return; + } + let info = this._checkURLForSerpMatch(url); + if (!info) { + this._browserNewtabSessionMap.delete(browser); + this.stopTrackingBrowser(browser); + return; + } + + let source = "unknown"; + if (loadType & Ci.nsIDocShell.LOAD_CMD_RELOAD) { + source = "reload"; + } else if (loadType & Ci.nsIDocShell.LOAD_CMD_HISTORY) { + source = "tabhistory"; + } else if (this._browserSourceMap.has(browser)) { + source = this._browserSourceMap.get(browser); + this._browserSourceMap.delete(browser); + } + + // If it's a SERP but doesn't have a browser source, the source might be + // from something that happened in content. We keep this separate from + // source because legacy telemetry should not change its reporting. + let inContentSource; + if ( + lazy.serpEventsEnabled && + info.hasComponents && + this.#browserContentSourceMap.has(browser) + ) { + inContentSource = this.#browserContentSourceMap.get(browser); + this.#browserContentSourceMap.delete(browser); + } + + let newtabSessionId; + if (this._browserNewtabSessionMap.has(browser)) { + newtabSessionId = this._browserNewtabSessionMap.get(browser); + // We leave the newtabSessionId in the map for this browser + // until we stop loading SERP pages or the tab is closed. + } + + let impressionId; + if (lazy.serpEventsEnabled && info.hasComponents) { + // The UUID generated by Services.uuid contains leading and trailing braces. + // Need to trim them first. + impressionId = Services.uuid.generateUUID().toString().slice(1, -1); + + impressionIdsWithoutEngagementsSet.add(impressionId); + } + + this._reportSerpPage(info, source, url); + + // For single page apps, we store the page by its original URI so the + // network observers can recover the browser in a context when they only + // have access to the originURL. 
+ let urlKey = + info.isSPA && browser.originalURI?.spec ? browser.originalURI.spec : url; + let item = this._browserInfoByURL.get(urlKey); + + let impressionInfo; + if (lazy.serpEventsEnabled && info.hasComponents) { + let partnerCode = ""; + if (info.code != "none" && info.code != null) { + partnerCode = info.code; + } + impressionInfo = { + provider: info.provider, + tagged: info.type.startsWith("tagged"), + partnerCode, + source: inContentSource ?? source, + isShoppingPage: info.isShoppingPage, + isPrivate: lazy.PrivateBrowsingUtils.isBrowserPrivate(browser), + }; + } + + if (item) { + item.browserTelemetryStateMap.set(browser, { + adsReported: false, + adImpressionsReported: false, + impressionId, + urlToComponentMap: null, + impressionInfo, + searchBoxSubmitted: false, + categorizationInfo: null, + adsClicked: 0, + adsVisible: 0, + searchQuery: info.searchQuery, + }); + item.count++; + item.source = source; + item.newtabSessionId = newtabSessionId; + } else { + item = { + browserTelemetryStateMap: new WeakMap().set(browser, { + adsReported: false, + adImpressionsReported: false, + impressionId, + urlToComponentMap: null, + impressionInfo, + searchBoxSubmitted: false, + categorizationInfo: null, + adsClicked: 0, + adsVisible: 0, + searchQuery: info.searchQuery, + }), + info, + count: 1, + source, + newtabSessionId, + majorVersion: parseInt(Services.appinfo.version), + channel: lazy.SearchUtils.MODIFIED_APP_CHANNEL, + region: lazy.Region.home, + isSPA: info.isSPA, + }; + // For single page apps, we store the page by its original URI so that + // network observers can recover the browser in a context when they only + // have the originURL to work with. + this._browserInfoByURL.set(urlKey, item); + } + this.#browserToItemMap.set(browser, item); + } + + /** + * Determines whether or not a browser should be untracked or tracked for + * SERPs who have single page app behaviour. + * + * The over-arching logic: + * 1. 
Only inspect the browser if the url matches a SERP that is a SPA. + * 2. Recording an engagement if we're tracking the browser and we're going + * to another page. + * 3. Untrack the browser if we're tracking it and switching pages. + * 4. Track the browser if we're now on a default search page. + * + * @param {BrowserElement} browser + * The browser element related to the request. + * @param {string} url + * The url of the request. + * @param {number} loadType + * The loadtype of a the request. + */ + async updateTrackingSinglePageApp(browser, url, loadType) { + let providerInfo = this._getProviderInfoForURL(url); + if (!providerInfo?.isSPA) { + return; + } + + let item = this.findItemForBrowser(browser); + let telemetryState = item?.browserTelemetryStateMap.get(browser); + + let previousSearchTerm = telemetryState?.searchQuery ?? ""; + let searchTerm = this.urlSearchTerms(url, providerInfo); + let searchTermChanged = previousSearchTerm !== searchTerm; + + let isSerp = !!this._checkURLForSerpMatch(url, providerInfo); + let browserIsTracked = !!telemetryState; + let isTabHistory = loadType & Ci.nsIDocShell.LOAD_CMD_HISTORY; + + // Step 2: Maybe record engagement. + if (browserIsTracked && !isTabHistory && (searchTermChanged || !isSerp)) { + // If we've established we've changed to another SERP, the cause could be + // from a submission event inside the content process. The event is + // sent to the parent and stored as `telemetryState.searchBoxSubmitted` + // but if we check now, it may be too early. Instead, we check with the + // content process directly to see if it recorded a submit event. 
+ let actor = browser.browsingContext.currentWindowGlobal.getActor( + "SearchSERPTelemetry" + ); + let didSubmit = await actor.sendQuery("SearchSERPTelemetry:DidSubmit"); + + if (telemetryState && !telemetryState.searchBoxSubmitted && !didSubmit) { + impressionIdsWithoutEngagementsSet.delete(telemetryState.impressionId); + Glean.serp.engagement.record({ + impression_id: telemetryState.impressionId, + action: SearchSERPTelemetryUtils.ACTIONS.CLICKED, + target: SearchSERPTelemetryUtils.COMPONENTS.NON_ADS_LINK, + }); + lazy.logConsole.debug("Counting click:", { + impressionId: telemetryState.impressionId, + type: SearchSERPTelemetryUtils.COMPONENTS.NON_ADS_LINK, + URL: url, + }); + } + } + + // Step 3: Maybe untrack the browser. + if (browserIsTracked && (searchTermChanged || !isSerp)) { + let reason = ""; + // If we have to untrack it, it might be due to the user using the + // back/forward button. + if (isTabHistory) { + reason = SearchSERPTelemetryUtils.ABANDONMENTS.NAVIGATION; + } + let actor = browser.browsingContext.currentWindowGlobal.getActor( + "SearchSERPTelemetry" + ); + actor.sendAsyncMessage("SearchSERPTelemetry:StopTrackingDocument"); + this.stopTrackingBrowser(browser, reason); + browserIsTracked = false; + } + + // Step 4: Maybe track the browser. + if (isSerp && !browserIsTracked) { + this.updateTrackingStatus(browser, url, loadType); + let actor = browser.browsingContext.currentWindowGlobal.getActor( + "SearchSERPTelemetry" + ); + actor.sendAsyncMessage("SearchSERPTelemetry:WaitForSPAPageLoad"); + } + } + + /** + * Stops tracking of a tab, for example the tab has loaded a different URL. + * Also records a Glean abandonment event if appropriate. + * + * @param {object} browser The browser associated with the tab to stop being + * tracked. + * @param {string} abandonmentReason + * An optional parameter that specifies why the browser is deemed abandoned. + * The reason will be recorded as part of Glean abandonment telemetry. 
+ * One of SearchSERPTelemetryUtils.ABANDONMENTS. + */ + stopTrackingBrowser(browser, abandonmentReason) { + for (let [url, item] of this._browserInfoByURL) { + if (item.browserTelemetryStateMap.has(browser)) { + let telemetryState = item.browserTelemetryStateMap.get(browser); + let impressionId = telemetryState.impressionId; + if (impressionIdsWithoutEngagementsSet.has(impressionId)) { + this.recordAbandonmentTelemetry(impressionId, abandonmentReason); + } + + if ( + lazy.serpEventTelemetryCategorization && + telemetryState.categorizationInfo + ) { + SearchSERPCategorizationEventScheduler.sendCallback(browser); + } + + item.browserTelemetryStateMap.delete(browser); + item.count--; + } + + if (!item.count) { + this._browserInfoByURL.delete(url); + } + } + this.#browserToItemMap.delete(browser); + } + + /** + * Calculate how close two urls are in equality. + * + * The scoring system: + * - If the URLs look exactly the same, including the ordering of query + * parameters, the score is Infinity. + * - If the origin is the same, the score is increased by 1. Otherwise the + * score is 0. + * - If the path is the same, the score is increased by 1. + * - For each query parameter, if the key exists the score is increased by 1. + * Likewise if the query parameter values match. + * - If the hash is the same, the score is increased by 1. This includes if + * the hash is missing in both URLs. + * + * @param {URL} url1 + * Url to compare. + * @param {URL} url2 + * Other url to compare. Ordering shouldn't matter. + * @param {object} [matchOptions] + * Options for checking equality. + * @param {boolean} [matchOptions.path] + * Whether the path must match. Default to false. + * @param {boolean} [matchOptions.paramValues] + * Whether the values of the query parameters must match if the query + * parameter key exists in the other. Defaults to false. + * @returns {number} + * A score of how closely the two URLs match. 
Returns 0 if there is no + * match or the equality check failed for an enabled match option. + */ + compareUrls(url1, url2, matchOptions = {}) { + // In case of an exact match, well, that's an obvious winner. + if (url1.href == url2.href) { + return Infinity; + } + + // Each step we get closer to the two URLs being the same, we increase the + // score. The consumer of this method will use these scores to see which + // of the URLs is the best match. + let score = 0; + if (url1.origin == url2.origin) { + ++score; + if (url1.pathname == url2.pathname) { + ++score; + for (let [key1, value1] of url1.searchParams) { + // Let's not fuss about the ordering of search params, since the + // score effect will solve that. + if (url2.searchParams.has(key1)) { + ++score; + if (url2.searchParams.get(key1) == value1) { + ++score; + } else if (matchOptions.paramValues) { + return 0; + } + } + } + if (url1.hash == url2.hash) { + ++score; + } + } else if (matchOptions.path) { + return 0; + } + } + return score; + } + + /** + * Extracts the search terms from the URL based on the provider info. + * + * @param {string} url + * The URL to inspect. + * @param {object} providerInfo + * The providerInfo associated with the URL. + * @returns {string} + * The search term or if none is found, a blank string. + */ + urlSearchTerms(url, providerInfo) { + if (providerInfo?.queryParamNames?.length) { + let { searchParams } = new URL(url); + for (let queryParamName of providerInfo.queryParamNames) { + let value = searchParams.get(queryParamName); + if (value) { + return value; + } + } + } + return ""; + } + + findItemForBrowser(browser) { + return this.#browserToItemMap.get(browser); + } + + /** + * Parts of the URL, like search params and hashes, may be mutated by scripts + * on a page we're tracking. 
Since we don't want to keep track of that + * ourselves in order to keep the list of browser objects a weak-referenced + * set, we do optional fuzzy matching of URLs to fetch the most relevant item + * that contains tracking information. + * + * @param {string} url URL to fetch the tracking data for. + * @returns {object} Map containing the following members: + * - {WeakMap} browsers + * Map of browser elements that belong to `url` and their ad report state. + * - {object} info + * Info dictionary as returned by `_checkURLForSerpMatch`. + * - {number} count + * The number of browser element we can most accurately tell we're + * tracking, since they're inside a WeakMap. + */ + _findBrowserItemForURL(url) { + try { + url = new URL(url); + } catch (ex) { + return null; + } + + let item; + let currentBestMatch = 0; + for (let [trackingURL, candidateItem] of this._browserInfoByURL) { + if (currentBestMatch === Infinity) { + break; + } + try { + // Make sure to cache the parsed URL object, since there's no reason to + // do it twice. + trackingURL = + candidateItem._trackingURL || + (candidateItem._trackingURL = new URL(trackingURL)); + } catch (ex) { + continue; + } + let score = this.compareUrls(url, trackingURL); + if (score > currentBestMatch) { + item = candidateItem; + currentBestMatch = score; + } + } + + return item; + } + + // nsIWindowMediatorListener + + /** + * This is called when a new window is opened, and handles registration of + * that window if it is a browser window. + * + * @param {nsIAppWindow} appWin The xul window that was opened. + */ + onOpenWindow(appWin) { + let win = appWin.docShell.domWindow; + win.addEventListener( + "load", + () => { + if ( + win.document.documentElement.getAttribute("windowtype") != + "navigator:browser" + ) { + return; + } + + this._registerWindow(win); + }, + { once: true } + ); + } + + /** + * Listener that is called when a window is closed, and handles deregistration of + * that window if it is a browser window. 
+ * + * @param {nsIAppWindow} appWin The xul window that was closed. + */ + onCloseWindow(appWin) { + let win = appWin.docShell.domWindow; + + if ( + win.document.documentElement.getAttribute("windowtype") != + "navigator:browser" + ) { + return; + } + + this._unregisterWindow(win); + } + + /** + * Adds event listeners for the window and registers it with the content handler. + * + * @param {object} win The window to register. + */ + _registerWindow(win) { + win.gBrowser.tabContainer.addEventListener("TabClose", this); + } + + /** + * Removes event listeners for the window and unregisters it with the content + * handler. + * + * @param {object} win The window to unregister. + */ + _unregisterWindow(win) { + for (let tab of win.gBrowser.tabs) { + this.stopTrackingBrowser( + tab.linkedBrowser, + SearchSERPTelemetryUtils.ABANDONMENTS.WINDOW_CLOSE + ); + } + + win.gBrowser.tabContainer.removeEventListener("TabClose", this); + } + + /** + * Searches for provider information for a given url. + * + * @param {string} url The url to match for a provider. + * @returns {Array | null} Returns an array of provider name and the provider information. + */ + _getProviderInfoForURL(url) { + return this._searchProviderInfo.find(info => + info.searchPageRegexp.test(url) + ); + } + + /** + * Checks to see if a url is a search partner location, and determines the + * provider and codes used. + * + * @param {string} url The url to match. + * @returns {null|object} Returns null if there is no match found. Otherwise, + * returns an object of strings for provider, code and type. + */ + _checkURLForSerpMatch(url) { + let searchProviderInfo = this._getProviderInfoForURL(url); + if (!searchProviderInfo) { + return null; + } + + let queries = new URLSearchParams(url.split("#")[0].split("?")[1]); + + let isSPA = !!searchProviderInfo.isSPA; + if (isSPA) { + // A URL may have a specific query parameter denoting a search page. 
+ // If the key was expected but doesn't currently exist, it could be due to + // the initial url containing it until after a page load. + // In that case, ignore this check since most SERPs missing the query + // param will go to the default search page. + let { key, value } = searchProviderInfo.defaultPageQueryParam; + if (key && queries.has(key) && queries.get(key) != value) { + return null; + } + } + + // Some URLs can match provider info but also be the provider's homepage + // instead of a SERP. + // e.g. https://example.com/ vs. https://example.com/?foo=bar + // Look for the presence of the query parameter that contains a search term. + let hasQuery = false; + let searchQuery = ""; + for (let queryParamName of searchProviderInfo.queryParamNames) { + searchQuery = queries.get(queryParamName); + if (searchQuery) { + hasQuery = true; + break; + } + } + if (!hasQuery) { + return null; + } + // Default to organic to simplify things. + // We override type in the sap cases. + let type = "organic"; + let code; + if (searchProviderInfo.codeParamName) { + code = queries.get(searchProviderInfo.codeParamName); + if (code) { + // The code is only included if it matches one of the specific ones. + if (searchProviderInfo.taggedCodes.includes(code)) { + type = "tagged"; + if ( + searchProviderInfo.followOnParamNames && + searchProviderInfo.followOnParamNames.some(p => queries.has(p)) + ) { + type += "-follow-on"; + } + } else if (searchProviderInfo.organicCodes.includes(code)) { + type = "organic"; + } else if (searchProviderInfo.expectedOrganicCodes?.includes(code)) { + code = "none"; + } else { + code = "other"; + } + } else if (searchProviderInfo.followOnCookies) { + // Especially Bing requires lots of extra work related to cookies. 
+ for (let followOnCookie of searchProviderInfo.followOnCookies) { + if (followOnCookie.extraCodeParamName) { + let eCode = queries.get(followOnCookie.extraCodeParamName); + if ( + !eCode || + !followOnCookie.extraCodePrefixes.some(p => eCode.startsWith(p)) + ) { + continue; + } + } + + // If this cookie is present, it's probably an SAP follow-on. + // This might be an organic follow-on in the same session, but there + // is no way to tell the difference. + for (let cookie of Services.cookies.getCookiesFromHost( + followOnCookie.host, + {} + )) { + if (cookie.name != followOnCookie.name) { + continue; + } + + let [cookieParam, cookieValue] = cookie.value + .split("=") + .map(p => p.trim()); + if ( + cookieParam == followOnCookie.codeParamName && + searchProviderInfo.taggedCodes.includes(cookieValue) + ) { + type = "tagged-follow-on"; + code = cookieValue; + break; + } + } + } + } + } + let isShoppingPage = false; + let hasComponents = false; + if (lazy.serpEventsEnabled) { + if (searchProviderInfo.shoppingTab?.regexp) { + isShoppingPage = searchProviderInfo.shoppingTab.regexp.test(url); + } + if (searchProviderInfo.components?.length) { + hasComponents = true; + } + } + return { + provider: searchProviderInfo.telemetryId, + type, + code, + isShoppingPage, + hasComponents, + searchQuery, + isSPA, + }; + } + + /** + * Logs telemetry for a search provider visit. + * + * @param {object} info The search provider information. + * @param {string} info.provider The name of the provider. + * @param {string} info.type The type of search. + * @param {string} [info.code] The code for the provider. + * @param {string} source Where the search originated from. + * @param {string} url The url that was matched (for debug logging only). 
/**
 * ContentHandler deals with handling telemetry of the content within a tab -
 * when ads are detected and when they are selected.
 */
class ContentHandler {
  /**
   * Constructor.
   *
   * @param {object} options
   *   The options for the handler.
   * @param {Map} options.browserInfoByURL
   *   The map of urls from TelemetryHandler.
   * @param {Function} options.findBrowserItemForURL
   *   Called with a URL; returns the tracked browser item for it, if any.
   * @param {Function} options.checkURLForSerpMatch
   *   Called with a URL; returns the SERP match information (including the
   *   provider) or a falsy value when the URL is not a tracked SERP.
   * @param {Function} options.findItemForBrowser
   *   Called with a browser; returns the tracked browser item for it, if any.
   */
  constructor(options) {
    this._browserInfoByURL = options.browserInfoByURL;
    this._findBrowserItemForURL = options.findBrowserItemForURL;
    this._checkURLForSerpMatch = options.checkURLForSerpMatch;
    this._findItemForBrowser = options.findItemForBrowser;
  }

  /**
   * Initializes the content handler. This will also set up the shared data that is
   * shared with the SearchTelemetryChild actor.
   *
   * @param {Array} providerInfo
   *  The provider information for the search telemetry to record.
   */
  init(providerInfo) {
    Services.ppmm.sharedData.set(
      SEARCH_TELEMETRY_SHARED.PROVIDER_INFO,
      providerInfo
    );
    Services.ppmm.sharedData.set(
      SEARCH_TELEMETRY_SHARED.LOAD_TIMEOUT,
      ADLINK_CHECK_TIMEOUT_MS
    );
    Services.ppmm.sharedData.set(
      SEARCH_TELEMETRY_SHARED.SPA_LOAD_TIMEOUT,
      SPA_ADLINK_CHECK_TIMEOUT_MS
    );

    Services.obs.addObserver(this, "http-on-examine-response");
    Services.obs.addObserver(this, "http-on-examine-cached-response");
    Services.obs.addObserver(this, "http-on-stop-request");
  }

  /**
   * Uninitializes the content handler.
   */
  uninit() {
    Services.obs.removeObserver(this, "http-on-examine-response");
    Services.obs.removeObserver(this, "http-on-examine-cached-response");
    Services.obs.removeObserver(this, "http-on-stop-request");
  }

  /**
   * Test-only function to override the search provider information for use
   * with tests. Passes it to the SearchTelemetryChild actor.
   *
   * @param {object} providerInfo @see SEARCH_PROVIDER_INFO for type information.
   */
  overrideSearchTelemetryForTests(providerInfo) {
    Services.ppmm.sharedData.set("SearchTelemetry:ProviderInfo", providerInfo);
  }

  /**
   * Reports bandwidth used by the given channel if it is used by search requests.
   *
   * @param {object} aChannel The channel that generated the activity.
   */
  _reportChannelBandwidth(aChannel) {
    if (!(aChannel instanceof Ci.nsIChannel)) {
      return;
    }
    let wrappedChannel = ChannelWrapper.get(aChannel);

    // Resolves the URL of the top-level page the channel belongs to.
    let getTopURL = channel => {
      // top-level document
      if (
        channel.loadInfo &&
        channel.loadInfo.externalContentPolicyType ==
          Ci.nsIContentPolicy.TYPE_DOCUMENT
      ) {
        return channel.finalURL;
      }

      // iframe
      let frameAncestors;
      try {
        frameAncestors = channel.frameAncestors;
      } catch (e) {
        frameAncestors = null;
      }
      if (frameAncestors) {
        let ancestor = frameAncestors.find(obj => obj.frameId == 0);
        if (ancestor) {
          return ancestor.url;
        }
      }

      // top-level resource
      if (channel.loadInfo && channel.loadInfo.loadingPrincipal) {
        return channel.loadInfo.loadingPrincipal.spec;
      }

      return null;
    };

    let topUrl = getTopURL(wrappedChannel);
    if (!topUrl) {
      return;
    }

    let info = this._checkURLForSerpMatch(topUrl);
    if (!info) {
      return;
    }

    let bytesTransferred =
      wrappedChannel.requestSize + wrappedChannel.responseSize;
    let { provider } = info;

    let isPrivate =
      wrappedChannel.loadInfo &&
      wrappedChannel.loadInfo.originAttributes.privateBrowsingId > 0;
    if (isPrivate) {
      provider += `-${SEARCH_TELEMETRY_PRIVATE_BROWSING_KEY_SUFFIX}`;
    }

    Services.telemetry.keyedScalarAdd(
      SEARCH_DATA_TRANSFERRED_SCALAR,
      provider,
      bytesTransferred
    );
  }

  /**
   * Observer entry point for the HTTP topics registered in init().
   *
   * @param {object} aSubject The subject of the notification (a channel).
   * @param {string} aTopic The notification topic.
   * @param {string} aData Unused.
   */
  observe(aSubject, aTopic, aData) {
    switch (aTopic) {
      case "http-on-stop-request":
        this._reportChannelBandwidth(aSubject);
        break;
      case "http-on-examine-response":
      case "http-on-examine-cached-response":
        this.observeActivity(aSubject);
        break;
    }
  }

  /**
   * Listener that observes network activity, so that we can determine if a link
   * from a search provider page was followed, and if so, whether that link was
   * an ad click or not.
   *
   * @param {nsIChannel} channel The channel that generated the activity.
   */
  observeActivity(channel) {
    if (!(channel instanceof Ci.nsIChannel)) {
      return;
    }

    let wrappedChannel = ChannelWrapper.get(channel);
    // The channel we're observing might be a redirect of a channel we've
    // observed before.
    if (wrappedChannel._adClickRecorded) {
      lazy.logConsole.debug("Ad click already recorded");
      return;
    }

    Services.tm.dispatchToMainThread(() => {
      // We suspect that No Content (204) responses are used to transfer or
      // update beacons. They used to lead to double-counting ad-clicks, so let's
      // ignore them.
      if (wrappedChannel.statusCode == 204) {
        lazy.logConsole.debug("Ignoring activity from ambiguous responses");
        return;
      }

      // The wrapper is consistent across redirects, so we can use it to track state.
      let originURL = wrappedChannel.originURI && wrappedChannel.originURI.spec;
      if (!originURL) {
        return;
      }
      let item = this._findBrowserItemForURL(originURL);
      if (!item) {
        return;
      }

      let url = wrappedChannel.finalURL;

      let telemetryId = item.info.provider;
      let info = this._searchProviderInfo.find(provider => {
        return provider.telemetryId == telemetryId;
      });
      // Without the provider configuration neither the Glean nor the legacy
      // telemetry below can classify the request, so bail out instead of
      // throwing inside the dispatched runnable.
      if (!info) {
        lazy.logConsole.debug("No provider info found for", telemetryId);
        return;
      }

      // If an error occurs with Glean SERP telemetry logic, avoid
      // disrupting legacy telemetry.
      try {
        this.#maybeRecordSERPTelemetry(wrappedChannel, item, info);
      } catch (ex) {
        lazy.logConsole.error(ex);
      }

      if (!info.extraAdServersRegexps?.some(regex => regex.test(url))) {
        return;
      }

      try {
        Services.telemetry.keyedScalarAdd(
          SEARCH_AD_CLICKS_SCALAR_BASE + item.source,
          `${info.telemetryId}:${item.info.type}`,
          1
        );
        wrappedChannel._adClickRecorded = true;
        if (item.newtabSessionId) {
          Glean.newtabSearchAd.click.record({
            newtab_visit_id: item.newtabSessionId,
            search_access_point: item.source,
            is_follow_on: item.info.type.endsWith("follow-on"),
            is_tagged: item.info.type.startsWith("tagged"),
            telemetry_id: item.info.provider,
          });
        }

        lazy.logConsole.debug("Counting ad click in page for:", {
          source: item.source,
          originURL,
          URL: url,
        });
      } catch (e) {
        lazy.logConsole.error(e);
      }
    });
  }

  /**
   * Checks if a request should record a SERP engagement (click) if it can be
   * traced to a browser containing an observed SERP.
   *
   * @param {ChannelWrapper} wrappedChannel
   *   The wrapped channel.
   * @param {object} item
   *   The browser item associated with the origin URL of the request.
   * @param {object} info
   *   The search provider info associated with the item.
   */
  #maybeRecordSERPTelemetry(wrappedChannel, item, info) {
    if (!lazy.serpEventsEnabled) {
      return;
    }

    if (wrappedChannel._recordedClick) {
      lazy.logConsole.debug("Click already recorded.");
      return;
    }

    let originURL = wrappedChannel.originURI?.spec;
    let url = wrappedChannel.finalURL;
    // Some channels re-direct by loading pages that return 200. The result
    // is the channel will have an originURL that changes from the SERP to
    // either a nonAdsRegexp or an extraAdServersRegexps. This is typical
    // for loading a page in a new tab. The channel will have changed so any
    // properties attached to them to record state (e.g. _recordedClick)
    // won't be present.
    // Both regexp lists are treated as optional, matching observeActivity().
    if (
      info.nonAdsLinkRegexps?.some(r => r.test(originURL)) ||
      info.extraAdServersRegexps?.some(r => r.test(originURL))
    ) {
      return;
    }

    // A click event is recorded if a user loads a resource from an
    // originURL that is a SERP.
    //
    // Typically, we only want top level loads containing documents to avoid
    // recording any event on an in-page resource a SERP might load
    // (e.g. CSS files).
    //
    // The exception to this is if a subframe loads a resource that matches
    // a non ad link. Some SERPs encode non ad search results with a URL
    // that gets loaded into an iframe, which then tells the container of
    // the iframe to change the location of the page.
    if (
      wrappedChannel.channel.isDocument &&
      (wrappedChannel.channel.loadInfo.isTopLevelLoad ||
        info.nonAdsLinkRegexps?.some(r => r.test(url)))
    ) {
      let browser = wrappedChannel.browserElement;

      // If the load is from history, don't record an event.
      if (
        browser?.browsingContext.webProgress?.loadType &
        Ci.nsIDocShell.LOAD_CMD_HISTORY
      ) {
        lazy.logConsole.debug("Ignoring load from history");
        return;
      }

      // Step 1: Check if the browser associated with the request was a
      // tracked SERP.
      let start = Cu.now();
      let telemetryState;
      let isFromNewtab = false;
      if (item.browserTelemetryStateMap.has(browser)) {
        // If the map contains the browser, then it means that the SERP is
        // going from one page to another. We know this because previous
        // conditions prevent non-top level loads from occurring here.
        telemetryState = item.browserTelemetryStateMap.get(browser);
      } else if (browser) {
        // Alternatively, it could be the case that the request is occurring in
        // a new tab but was triggered by one of the browsers in the state map.
        // If only one browser exists in the state map, it must be that one.
        if (item.count === 1) {
          let sourceBrowsers = ChromeUtils.nondeterministicGetWeakMapKeys(
            item.browserTelemetryStateMap
          );
          if (sourceBrowsers?.length) {
            telemetryState = item.browserTelemetryStateMap.get(
              sourceBrowsers[0]
            );
          }
        } else if (item.count > 1) {
          // If the count is more than 1, then multiple open SERPs contain the
          // same search term, so try to find the specific browser that opened
          // the request.
          let tabBrowser = browser.getTabBrowser();
          // The browser may not be backed by a tab, hence the optional chain.
          let tab = tabBrowser?.getTabForBrowser(browser)?.openerTab;
          // A tab will not always have an openerTab, as first tabs in new
          // windows don't have an openerTab.
          // Bug 1867582: We should also handle the case where multiple tabs
          // contain the same search term.
          if (tab) {
            telemetryState = item.browserTelemetryStateMap.get(
              tab.linkedBrowser
            );
          }
        }
        if (telemetryState) {
          isFromNewtab = true;
        }
      }

      // Step 2: If we have telemetryState, the browser object must be
      // associated with another browser that is tracked. Try to find the
      // component type on the SERP responsible for the request.
      // Exceptions:
      // - If a searchbox was used to initiate the load, don't record another
      //   engagement because the event was logged elsewhere.
      // - If the ad impression hasn't been recorded yet, we have no way of
      //   knowing precisely what kind of component was selected.
      let isSerp = false;
      if (
        telemetryState &&
        telemetryState.adImpressionsReported &&
        !telemetryState.searchBoxSubmitted
      ) {
        if (info.searchPageRegexp?.test(originURL)) {
          isSerp = true;
        }

        let startFindComponent = Cu.now();
        let parsedUrl = new URL(url);
        // Determine the component type of the link.
        let type;
        for (let [
          storedUrl,
          componentType,
        ] of telemetryState.urlToComponentMap.entries()) {
          // The URL we're navigating to may have more query parameters if
          // the provider adds query parameters when the user clicks on a link.
          // On the other hand, the URL we are navigating to may have
          // fewer query parameters because of query param stripping.
          // Thus, if a query parameter is missing, a match can still be made
          // provided keys that exist in both URLs contain equal values.
          let score = SearchSERPTelemetry.compareUrls(storedUrl, parsedUrl, {
            paramValues: true,
            path: true,
          });
          if (score) {
            type = componentType;
            break;
          }
        }
        ChromeUtils.addProfilerMarker(
          "SearchSERPTelemetry._observeActivity",
          startFindComponent,
          "Find component for URL"
        );

        // Default value for URLs that don't match any components categorized
        // on the page.
        if (!type) {
          type = SearchSERPTelemetryUtils.COMPONENTS.NON_ADS_LINK;
        }

        if (
          type == SearchSERPTelemetryUtils.COMPONENTS.REFINED_SEARCH_BUTTONS
        ) {
          SearchSERPTelemetry.setBrowserContentSource(
            browser,
            SearchSERPTelemetryUtils.INCONTENT_SOURCES.REFINE_ON_SERP
          );
        } else if (isSerp && isFromNewtab) {
          SearchSERPTelemetry.setBrowserContentSource(
            browser,
            SearchSERPTelemetryUtils.INCONTENT_SOURCES.OPENED_IN_NEW_TAB
          );
        }

        // Step 3: Record the engagement.
        impressionIdsWithoutEngagementsSet.delete(telemetryState.impressionId);
        if (AD_COMPONENTS.includes(type)) {
          telemetryState.adsClicked += 1;
        }
        Glean.serp.engagement.record({
          impression_id: telemetryState.impressionId,
          action: SearchSERPTelemetryUtils.ACTIONS.CLICKED,
          target: type,
        });
        lazy.logConsole.debug("Counting click:", {
          impressionId: telemetryState.impressionId,
          type,
          URL: url,
        });
        // Prevent re-directed channels from being examined more than once.
        wrappedChannel._recordedClick = true;
      }
      ChromeUtils.addProfilerMarker(
        "SearchSERPTelemetry._observeActivity",
        start,
        "Maybe record user engagement."
      );
    }
  }

  /**
   * Logs telemetry for a page with adverts, if it is one of the partner search
   * provider pages that we're tracking.
   *
   * @param {object} info
   *     The search provider information for the page.
   * @param {boolean} info.hasAds
   *     Whether or not the page has adverts.
   * @param {string} info.url
   *     The url of the page.
   * @param {object} browser
   *     The browser associated with the page.
   */
  _reportPageWithAds(info, browser) {
    let item = this._findItemForBrowser(browser);
    let telemetryState = item?.browserTelemetryStateMap.get(browser);
    // Without the per-browser state we cannot de-duplicate the report, so
    // treat a missing state exactly like a missing item.
    if (!item || !telemetryState) {
      lazy.logConsole.warn(
        "Expected to report URI for",
        info.url,
        "with ads but couldn't find the information"
      );
      return;
    }

    if (telemetryState.adsReported) {
      lazy.logConsole.debug(
        "Ad was previously reported for browser with URI",
        info.url
      );
      return;
    }

    lazy.logConsole.debug(
      "Counting ads in page for",
      item.info.provider,
      item.info.type,
      item.source,
      info.url
    );
    Services.telemetry.keyedScalarAdd(
      SEARCH_WITH_ADS_SCALAR_BASE + item.source,
      `${item.info.provider}:${item.info.type}`,
      1
    );
    Services.obs.notifyObservers(null, "reported-page-with-ads");

    telemetryState.adsReported = true;

    if (item.newtabSessionId) {
      Glean.newtabSearchAd.impression.record({
        newtab_visit_id: item.newtabSessionId,
        search_access_point: item.source,
        is_follow_on: item.info.type.endsWith("follow-on"),
        is_tagged: item.info.type.startsWith("tagged"),
        telemetry_id: item.info.provider,
      });
    }
  }

  /**
   * Logs ad impression telemetry for a page with adverts, if it is
   * one of the partner search provider pages that we're tracking.
   *
   * @param {object} info
   *     The search provider information for the page.
   * @param {string} info.url
   *     The url of the page.
   * @param {Map<string, object>} info.adImpressions
   *     A map of ad impressions found for the page, where the key
   *     is the type of ad component and the value is an object
   *     containing the number of ads that were loaded, visible,
   *     and hidden.
   * @param {Map<string, string>} info.hrefToComponentMap
   *     A map of hrefs to their component type. Contains both ads
   *     and non-ads.
   * @param {object} browser
   *     The browser associated with the page.
   */
  _reportPageWithAdImpressions(info, browser) {
    let item = this._findItemForBrowser(browser);
    if (!item) {
      return;
    }
    let telemetryState = item.browserTelemetryStateMap.get(browser);
    if (
      lazy.serpEventsEnabled &&
      info.adImpressions &&
      telemetryState &&
      !telemetryState.adImpressionsReported
    ) {
      for (let [componentType, data] of info.adImpressions.entries()) {
        telemetryState.adsVisible += data.adsVisible;

        lazy.logConsole.debug("Counting ad:", { type: componentType, ...data });
        Glean.serp.adImpression.record({
          impression_id: telemetryState.impressionId,
          component: componentType,
          ads_loaded: data.adsLoaded,
          ads_visible: data.adsVisible,
          ads_hidden: data.adsHidden,
        });
      }
      // Convert hrefToComponentMap to a urlToComponentMap in order to cache
      // the query parameters of the href.
      let urlToComponentMap = new Map();
      for (let [href, adType] of info.hrefToComponentMap) {
        // A single unparsable href must not abort the whole report: the
        // events above were already recorded, and leaving
        // adImpressionsReported unset would record them again next time.
        let parsedHref;
        try {
          parsedHref = new URL(href);
        } catch (ex) {
          lazy.logConsole.debug("Skipping unparsable href:", href);
          continue;
        }
        urlToComponentMap.set(parsedHref, adType);
      }
      telemetryState.urlToComponentMap = urlToComponentMap;
      telemetryState.adImpressionsReported = true;
      Services.obs.notifyObservers(null, "reported-page-with-ad-impressions");
    }
  }

  /**
   * Records a page action from a SERP page. Normally, actions are tracked in
   * the parent process by observing network events but some actions are not
   * possible to detect outside of subscribing to the child process.
   *
   * @param {object} info
   *   The search provider information for the page.
   * @param {string} info.type
   *   The component type that was clicked on.
   * @param {string} info.action
   *   The action taken on the page.
   * @param {object} browser
   *   The browser associated with the page.
   */
  _reportPageAction(info, browser) {
    let item = this._findItemForBrowser(browser);
    if (!item) {
      return;
    }
    let telemetryState = item.browserTelemetryStateMap.get(browser);
    let impressionId = telemetryState?.impressionId;
    if (info.type && impressionId) {
      lazy.logConsole.debug(`Recorded page action:`, {
        impressionId: telemetryState.impressionId,
        type: info.type,
        action: info.action,
      });
      Glean.serp.engagement.record({
        impression_id: impressionId,
        action: info.action,
        target: info.type,
      });
      impressionIdsWithoutEngagementsSet.delete(impressionId);
      // In-content searches are not categorized with a type, so they will
      // not be picked up in the network processes.
      if (
        info.type == SearchSERPTelemetryUtils.COMPONENTS.INCONTENT_SEARCHBOX &&
        info.action == SearchSERPTelemetryUtils.ACTIONS.SUBMITTED
      ) {
        telemetryState.searchBoxSubmitted = true;
        SearchSERPTelemetry.setBrowserContentSource(
          browser,
          SearchSERPTelemetryUtils.INCONTENT_SOURCES.SEARCHBOX
        );
      }
    } else {
      lazy.logConsole.warn(
        "Expected to report a",
        info.action,
        "engagement for",
        info.url,
        "but couldn't find an impression id."
      );
    }
  }

  /**
   * Records the Glean SERP impression event for a browser, using the
   * impression information stored in its telemetry state.
   *
   * @param {object} info
   *   The information reported for the page.
   * @param {boolean} info.shoppingTabDisplayed
   *   Recorded as-is in the `shopping_tab_displayed` field of the event.
   * @param {object} browser
   *   The browser associated with the page.
   */
  _reportPageImpression(info, browser) {
    let item = this._findItemForBrowser(browser);
    // The browser may no longer be tracked; the optional chain folds that
    // case into the "no telemetry state" branch below.
    let telemetryState = item?.browserTelemetryStateMap.get(browser);
    if (!telemetryState?.impressionInfo) {
      lazy.logConsole.debug(
        "Could not find telemetry state or impression info."
      );
      return;
    }
    let impressionId = telemetryState.impressionId;
    if (impressionId) {
      let impressionInfo = telemetryState.impressionInfo;
      Glean.serp.impression.record({
        impression_id: impressionId,
        provider: impressionInfo.provider,
        tagged: impressionInfo.tagged,
        partner_code: impressionInfo.partnerCode,
        source: impressionInfo.source,
        shopping_tab_displayed: info.shoppingTabDisplayed,
        is_shopping_page: impressionInfo.isShoppingPage,
        is_private: impressionInfo.isPrivate,
      });
      lazy.logConsole.debug(`Reported Impression:`, {
        impressionId,
        ...impressionInfo,
        shoppingTabDisplayed: info.shoppingTabDisplayed,
      });
      Services.obs.notifyObservers(null, "reported-page-with-impression");
    } else {
      lazy.logConsole.debug("Could not find an impression id.");
    }
  }

  /**
   * Initiates the categorization and reporting of domains extracted from
   * SERPs.
   *
   * @param {object} info
   *   The search provider information for the page.
   * @param {Set} info.nonAdDomains
   *   The non-ad domains extracted from the page.
   * @param {Set} info.adDomains
   *   The ad domains extracted from the page.
   * @param {object} browser
   *   The browser associated with the page.
   */
  _reportPageDomains(info, browser) {
    let item = this._findItemForBrowser(browser);
    // An untracked browser is handled like a missing telemetry state: nothing
    // is categorized but observers are still notified below.
    let telemetryState = item?.browserTelemetryStateMap.get(browser);
    if (lazy.serpEventTelemetryCategorization && telemetryState) {
      let result = SearchSERPCategorization.maybeCategorizeSERP(
        info.nonAdDomains,
        info.adDomains,
        item.info.provider
      );
      if (result) {
        telemetryState.categorizationInfo = result;
        // Deferred so that the click/visibility counters reflect their values
        // at the time the scheduler decides to report.
        let callback = () => {
          let impressionInfo = telemetryState.impressionInfo;
          SERPCategorizationRecorder.recordCategorizationTelemetry({
            ...telemetryState.categorizationInfo,
            app_version: item.majorVersion,
            channel: item.channel,
            region: item.region,
            partner_code: impressionInfo.partnerCode,
            provider: impressionInfo.provider,
            tagged: impressionInfo.tagged,
            num_ads_clicked: telemetryState.adsClicked,
            num_ads_visible: telemetryState.adsVisible,
          });
        };
        SearchSERPCategorizationEventScheduler.addCallback(browser, callback);
      }
    }
    Services.obs.notifyObservers(
      null,
      "reported-page-with-categorized-domains"
    );
  }
}
/**
 * Categorizes SERPs.
 */
class SERPCategorizer {
  /**
   * Categorizes domains extracted from SERPs. Note that we don't process
   * domains if the domain-to-categories map is empty (if the client couldn't
   * download Remote Settings attachments, for example).
   *
   * The returned object contains, for both the `organic_` and `sponsored_`
   * prefixes: `<prefix>category` (the chosen category), `<prefix>num_domains`
   * (the number of domains examined), `<prefix>num_unknown` (the number of
   * domains without any mapping) and `<prefix>num_inconclusive` (the number of
   * domains mapped to the inconclusive category), plus `mappings_version`.
   *
   * @param {Set} nonAdDomains
   *   Domains from organic results extracted from the page.
   * @param {Set} adDomains
   *   Domains from ad results extracted from the page.
   * @param {string} provider
   *   The provider associated with the page.
   * @returns {CategorizationResult | null}
   *   The final categorization result. Returns null if the map was empty.
   */
  maybeCategorizeSERP(nonAdDomains, adDomains, provider) {
    // Per DS, if the map was empty (e.g. because of a technical issue
    // downloading the data), we shouldn't report telemetry.
    // Thus, there is no point attempting to categorize the SERP.
    if (SearchSERPDomainToCategoriesMap.empty) {
      return null;
    }
    let resultsToReport = {};

    let processedDomains = this.processDomains(nonAdDomains, provider);
    let results = this.applyCategorizationLogic(processedDomains);
    resultsToReport.organic_category = results.category;
    resultsToReport.organic_num_domains = results.num_domains;
    resultsToReport.organic_num_unknown = results.num_unknown;
    resultsToReport.organic_num_inconclusive = results.num_inconclusive;

    processedDomains = this.processDomains(adDomains, provider);
    results = this.applyCategorizationLogic(processedDomains);
    resultsToReport.sponsored_category = results.category;
    resultsToReport.sponsored_num_domains = results.num_domains;
    resultsToReport.sponsored_num_unknown = results.num_unknown;
    resultsToReport.sponsored_num_inconclusive = results.num_inconclusive;

    resultsToReport.mappings_version = SearchSERPDomainToCategoriesMap.version;

    return resultsToReport;
  }

  /**
   * Applies the logic for reducing extracted domains to a single category for
   * the SERP.
   *
   * @param {Set} domains
   *   The domains extracted from the page.
   * @returns {object} resultsToReport
   *   The final categorization results. Keys are: "category", "num_domains",
   *   "num_unknown" and "num_inconclusive".
   */
  applyCategorizationLogic(domains) {
    let domainInfo = {};
    let domainsCount = 0;
    let unknownsCount = 0;
    let inconclusivesCount = 0;

    // Per a request from Data Science, we need to limit the number of domains
    // categorized to 10 non-ad domains and 10 ad domains.
    domains = new Set(
      [...domains].slice(0, CATEGORIZATION_SETTINGS.MAX_DOMAINS_TO_CATEGORIZE)
    );

    for (let domain of domains) {
      domainsCount++;

      let categoryCandidates = SearchSERPDomainToCategoriesMap.get(domain);

      if (!categoryCandidates.length) {
        unknownsCount++;
        continue;
      }

      // Inconclusive domains do not have more than one category candidate.
      if (
        categoryCandidates[0].category ==
        SearchSERPTelemetryUtils.CATEGORIZATION.INCONCLUSIVE
      ) {
        inconclusivesCount++;
        continue;
      }

      domainInfo[domain] = categoryCandidates;
    }

    let finalCategory;
    let topCategories = [];
    // Determine if all domains were unknown or inconclusive.
    if (unknownsCount + inconclusivesCount == domainsCount) {
      finalCategory = SearchSERPTelemetryUtils.CATEGORIZATION.INCONCLUSIVE;
    } else {
      let maxScore = CATEGORIZATION_SETTINGS.MINIMUM_SCORE;
      let rank = CATEGORIZATION_SETTINGS.STARTING_RANK;
      for (let categoryCandidates of Object.values(domainInfo)) {
        for (let { category, score } of categoryCandidates) {
          // Later candidates are discounted by the log of their rank.
          let adjustedScore = score / Math.log2(rank);
          if (adjustedScore > maxScore) {
            maxScore = adjustedScore;
            topCategories = [category];
          } else if (adjustedScore == maxScore) {
            // Store the category exactly as the leader is stored so the
            // array never mixes coerced and uncoerced values.
            topCategories.push(category);
          }
          rank++;
        }
      }
      if (!topCategories.length) {
        // No candidate reached the minimum score; report inconclusive rather
        // than an undefined category.
        finalCategory = SearchSERPTelemetryUtils.CATEGORIZATION.INCONCLUSIVE;
      } else if (topCategories.length > 1) {
        finalCategory = this.#chooseRandomlyFrom(topCategories);
      } else {
        finalCategory = topCategories[0];
      }
    }

    return {
      category: finalCategory,
      num_domains: domainsCount,
      num_unknown: unknownsCount,
      num_inconclusive: inconclusivesCount,
    };
  }

  /**
   * Processes raw domains extracted from the SERP into their final form before
   * categorization.
   *
   * @param {Set} domains
   *   The domains extracted from the page.
   * @param {string} provider
   *   The provider associated with the page.
   * @returns {Set} processedDomains
   *   The final set of processed domains for a page.
   */
  processDomains(domains, provider) {
    let processedDomains = new Set();

    for (let domain of domains) {
      // Don't include domains associated with the search provider.
      if (
        domain.startsWith(`${provider}.`) ||
        domain.includes(`.${provider}.`)
      ) {
        continue;
      }
      let domainWithoutSubdomains = this.#stripDomainOfSubdomains(domain);
      // We may have come across the same domain twice, once with www. prefixed
      // and another time without. The Set de-duplicates those for us.
      if (domainWithoutSubdomains) {
        processedDomains.add(domainWithoutSubdomains);
      }
    }

    return processedDomains;
  }

  /**
   * Helper to strip domains of any subdomains.
   *
   * @param {string} domain
   *   The domain to strip of any subdomains.
   * @returns {string}
   *   The given domain with any subdomains removed, or an empty string if the
   *   public suffix lookup failed or no second-level label was found.
   */
  #stripDomainOfSubdomains(domain) {
    let tld;
    // Can throw an exception if the input has too few domain levels.
    try {
      tld = Services.eTLD.getKnownPublicSuffixFromHost(domain);
    } catch (ex) {
      return "";
    }

    // The remainder ends with a ".", so the label right before the suffix is
    // the second-to-last element after splitting.
    let domainWithoutTLD = domain.substring(0, domain.length - tld.length);
    let secondLevelDomain = domainWithoutTLD.split(".").at(-2);

    return secondLevelDomain ? `${secondLevelDomain}.${tld}` : "";
  }

  /**
   * Picks one of the tied categories at random.
   *
   * @param {Array} categories
   *   The tied categories.
   * @returns {*}
   *   One element of the array.
   */
  #chooseRandomlyFrom(categories) {
    let randIdx = Math.floor(Math.random() * categories.length);
    return categories[randIdx];
  }
}
/**
 * Contains outstanding categorizations of browser objects that have yet to be
 * scheduled to be reported into a Glean event.
 * They are kept here until one of the conditions are met:
 * 1. The browser that was tracked is no longer being tracked.
 * 2. A user has been idle for IDLE_TIMEOUT_SECONDS
 * 3. The user has awoken their computer and the time elapsed from the last
 *    categorization event exceeds WAKE_TIMEOUT_MS.
 */
class CategorizationEventScheduler {
  /**
   * A WeakMap containing browser objects mapped to a callback.
   *
   * @type {WeakMap | null}
   */
  #browserToCallbackMap = null;

  /**
   * An instance of user idle service. Cached for testing purposes.
   *
   * @type {nsIUserIdleService | null}
   */
  #idleService = null;

  /**
   * Whether it has been initialized.
   *
   * @type {boolean}
   */
  #init = false;

  /**
   * The last Date.now() of a callback insertion.
   *
   * @type {number | null}
   */
  #mostRecentMs = null;

  constructor() {
    this.init();
  }

  /**
   * Sets up the callback map and subscribes to the idle service and to the
   * "quit-application" and "wake_notification" topics. Does nothing if the
   * categorization preference is disabled or if already initialized.
   */
  init() {
    if (!lazy.serpEventTelemetryCategorization || this.#init) {
      return;
    }

    lazy.logConsole.debug("Initializing categorization event scheduler.");

    this.#browserToCallbackMap = new WeakMap();

    // In tests, we simulate idleness as it is more reliable and easier than
    // trying to replicate idleness. The way to do so is by creating
    // a mock idle service and having the component subscribe to it. If we
    // used a lazy instantiation of idle service, the test could only ever be
    // subscribed to the real one.
    this.#idleService = Cc["@mozilla.org/widget/useridleservice;1"].getService(
      Ci.nsIUserIdleService
    );

    this.#idleService.addIdleObserver(
      this,
      CATEGORIZATION_SETTINGS.IDLE_TIMEOUT_SECONDS
    );

    Services.obs.addObserver(this, "quit-application");
    Services.obs.addObserver(this, "wake_notification");

    this.#init = true;
  }

  /**
   * Drops any queued callbacks and removes all observers added by init().
   */
  uninit() {
    if (!this.#init) {
      return;
    }

    this.#browserToCallbackMap = null;

    lazy.logConsole.debug("Un-initializing categorization event scheduler.");
    this.#idleService.removeIdleObserver(
      this,
      CATEGORIZATION_SETTINGS.IDLE_TIMEOUT_SECONDS
    );

    Services.obs.removeObserver(this, "quit-application");
    Services.obs.removeObserver(this, "wake_notification");

    this.#idleService = null;
    this.#init = false;
  }

  /**
   * Observer entry point for the idle service and observer topics.
   *
   * @param {object} subject Unused.
   * @param {string} topic The notification topic.
   * @param {string} data Unused.
   */
  observe(subject, topic, data) {
    switch (topic) {
      case "idle":
        lazy.logConsole.debug("Triggering all callbacks due to idle.");
        this.#sendAllCallbacks();
        break;
      case "quit-application":
        this.uninit();
        break;
      case "wake_notification":
        if (
          this.#mostRecentMs &&
          Date.now() - this.#mostRecentMs >=
            CATEGORIZATION_SETTINGS.WAKE_TIMEOUT_MS
        ) {
          lazy.logConsole.debug(
            "Triggering all callbacks due to a wake notification."
          );
          this.#sendAllCallbacks();
        }
        break;
    }
  }

  /**
   * Queues a callback for a browser, replacing any callback already queued
   * for it. Ignored when the scheduler is not initialized.
   *
   * @param {object} browser
   *   The browser the callback belongs to.
   * @param {Function} callback
   *   The function to invoke when the categorization should be reported.
   */
  addCallback(browser, callback) {
    lazy.logConsole.debug("Adding callback to queue.");
    if (!this.#browserToCallbackMap) {
      // Nothing can be stored, so don't record an insertion time either.
      return;
    }
    this.#mostRecentMs = Date.now();
    this.#browserToCallbackMap.set(browser, callback);
  }

  /**
   * Invokes and removes the callback queued for a browser, if any. Errors
   * thrown by the callback propagate to the caller.
   *
   * @param {object} browser
   *   The browser whose callback should be triggered.
   */
  sendCallback(browser) {
    let callback = this.#browserToCallbackMap?.get(browser);
    if (!callback) {
      return;
    }
    // Remove the entry before invoking it: a throwing callback must not stay
    // queued, and the map may be nulled (uninit) while the callback runs.
    this.#browserToCallbackMap.delete(browser);
    lazy.logConsole.debug("Triggering callback.");
    callback();
    Services.obs.notifyObservers(null, "recorded-single-categorization-event");
  }

  /**
   * Triggers every queued callback, then notifies observers.
   */
  #sendAllCallbacks() {
    let browsers = this.#browserToCallbackMap
      ? ChromeUtils.nondeterministicGetWeakMapKeys(this.#browserToCallbackMap)
      : null;
    if (browsers) {
      lazy.logConsole.debug("Triggering all callbacks.");
      for (let browser of browsers) {
        // One failing callback must not prevent the others from reporting.
        try {
          this.sendCallback(browser);
        } catch (ex) {
          lazy.logConsole.error(ex);
        }
      }
    }
    this.#mostRecentMs = null;
    Services.obs.notifyObservers(null, "recorded-all-categorization-events");
  }
}
+ * + * @type {Object<string, Array<DomainCategoryScore>> | null} + */ + #map = null; + + /** + * Latest version number of the attachments. + * + * @type {number | null} + */ + #version = null; + + /** + * The Remote Settings client. + * + * @type {object | null} + */ + #client = null; + + /** + * Whether this is synced with Remote Settings. + * + * @type {boolean} + */ + #init = false; + + /** + * Callback when Remote Settings syncs. + * + * @type {Function | null} + */ + #onSettingsSync = null; + + /** + * When downloading an attachment from Remote Settings fails, this will + * contain a timer which will eventually attempt to retry downloading + * attachments. + */ + #downloadTimer = null; + + /** + * Number of times this has attempted to try another download. Will reset + * if the categorization preference has been toggled, or a sync event has + * been detected. + * + * @type {number} + */ + #downloadRetries = 0; + + /** + * Runs at application startup with startup idle tasks. If the SERP + * categorization preference is enabled, it creates a Remote Settings + * client to listen to updates, and populates the map. + */ + async init() { + if (!lazy.serpEventTelemetryCategorization || this.#init) { + return; + } + lazy.logConsole.debug("Initializing domain-to-categories map."); + this.#setupClientAndMap(); + this.#init = true; + } + + uninit() { + if (this.#init) { + lazy.logConsole.debug("Un-initializing domain-to-categories map."); + this.#clearClientAndMap(); + this.#cancelAndNullifyTimer(); + this.#init = false; + } + } + + /** + * Given a domain, find categories and relevant scores. + * + * @param {string} domain Domain to lookup. + * @returns {Array<DomainCategoryScore>} + * An array containing categories and their respective score. If no record + * for the domain is available, return an empty array. 
+ */ + get(domain) { + if (this.empty) { + return []; + } + lazy.gCryptoHash.init(lazy.gCryptoHash.MD5); + let bytes = new TextEncoder().encode(domain); + lazy.gCryptoHash.update(bytes, domain.length); + let hash = lazy.gCryptoHash.finish(true); + let rawValues = this.#map[hash] ?? []; + if (rawValues.length) { + let output = []; + // Transform data into a more readable format. + // [x, y] => { category: x, score: y } + for (let i = 0; i < rawValues.length; i += 2) { + output.push({ category: rawValues[i], score: rawValues[i + 1] }); + } + return output; + } + return []; + } + + /** + * If the map was initialized, returns the version number for the data. + * The version number is determined by the record with the highest version + * number. Even if the records have different versions, only records from the + * latest version should be available. Returns null if the map was not + * initialized. + * + * @returns {null | number} The version number. + */ + get version() { + return this.#version; + } + + /** + * Whether the map is empty of data. + * + * @returns {boolean} + */ + get empty() { + return !this.#map; + } + + /** + * Unit test-only function, used to override the domainToCategoriesMap so + * that tests can set it to easy to test values. + * + * @param {object} domainToCategoriesMap + * An object where the key is a hashed domain and the value is an array + * containing an arbitrary number of DomainCategoryScores. 
+ */ + overrideMapForTests(domainToCategoriesMap) { + this.#map = domainToCategoriesMap; + } + + async #setupClientAndMap() { + if (this.#client && !this.empty) { + return; + } + lazy.logConsole.debug("Setting up domain-to-categories map."); + this.#client = lazy.RemoteSettings(TELEMETRY_CATEGORIZATION_KEY); + + this.#onSettingsSync = event => this.#sync(event.data); + this.#client.on("sync", this.#onSettingsSync); + + let records = await this.#client.get(); + await this.#clearAndPopulateMap(records); + } + + #clearClientAndMap() { + if (this.#client) { + lazy.logConsole.debug("Removing Remote Settings client."); + this.#client.off("sync", this.#onSettingsSync); + this.#client = null; + this.#onSettingsSync = null; + this.#downloadRetries = 0; + } + + if (this.#map) { + lazy.logConsole.debug("Clearing domain-to-categories map."); + this.#map = null; + this.#version = null; + } + } + + /** + * Inspects a list of records from the categorization domain bucket and finds + * the maximum version score from the set of records. Each record should have + * the same version number but if for any reason one entry has a lower + * version number, the latest version can be used to filter it out. + * + * @param {Array<DomainToCategoriesRecord>} records + * An array containing the records from a Remote Settings collection. + * @returns {number} + */ + #retrieveLatestVersion(records) { + return records.reduce((version, record) => { + if (record.version > version) { + return record.version; + } + return version; + }, 0); + } + + /** + * Callback when Remote Settings has indicated the collection has been + * synced. Since the records in the collection will be updated all at once, + * use the array of current records which at this point in time would have + * the latest records from Remote Settings. Additionally, delete any + * attachment for records that no longer exist. + * + * @param {object} data + * Object containing records that are current, deleted, created, or updated. 
+ * + */ + async #sync(data) { + lazy.logConsole.debug("Syncing domain-to-categories with Remote Settings."); + + // Remove local files of deleted records. + let toDelete = data?.deleted.filter(d => d.attachment); + await Promise.all( + toDelete.map(record => this.#client.attachments.deleteDownloaded(record)) + ); + + // In case a user encountered network failures in the past and kept their + // session on, this will ensure the next sync event will retry downloading + // again in case there's a new download error. + this.#downloadRetries = 0; + + this.#clearAndPopulateMap(data?.current); + } + + /** + * Clear the existing map and populate it with attachments found in the + * records. If no attachments are found, or no record containing an + * attachment contained the latest version, then nothing will change. + * + * @param {Array<DomainToCategoriesRecord>} records + * The records containing attachments. + * + */ + async #clearAndPopulateMap(records) { + // Set map to null so that if there are errors in the downloads, consumers + // will be able to know whether the map has information. Once we've + // successfully downloaded attachments and are parsing them, a non-null + // object will be created. + this.#map = null; + this.#version = null; + this.#cancelAndNullifyTimer(); + + if (!records?.length) { + lazy.logConsole.debug("No records found for domain-to-categories map."); + return; + } + + let fileContents = []; + for (let record of records) { + let result; + // Downloading attachments can fail. + try { + result = await this.#client.attachments.download(record); + } catch (ex) { + lazy.logConsole.error("Could not download file:", ex); + this.#createTimerToPopulateMap(); + return; + } + fileContents.push(result.buffer); + } + + // All attachments should have the same version number. If for whatever + // reason they don't, we should only use the attachments with the latest + // version. 
+ this.#version = this.#retrieveLatestVersion(records); + + if (!this.#version) { + lazy.logConsole.debug("Could not find a version number for any record."); + return; + } + + // Queue the series of assignments. + for (let i = 0; i < fileContents.length; ++i) { + let buffer = fileContents[i]; + Services.tm.idleDispatchToMainThread(() => { + let start = Cu.now(); + let json; + try { + json = JSON.parse(new TextDecoder().decode(buffer)); + } catch (ex) { + // TODO: If there was an error decoding the buffer, we may want to + // dispatch an error in telemetry or try again. + return; + } + ChromeUtils.addProfilerMarker( + "SearchSERPTelemetry.#clearAndPopulateMap", + start, + "Convert buffer to JSON." + ); + if (!this.#map) { + this.#map = {}; + } + Object.assign(this.#map, json); + lazy.logConsole.debug("Updated domain-to-categories map."); + if (i == fileContents.length - 1) { + Services.obs.notifyObservers( + null, + "domain-to-categories-map-update-complete" + ); + } + }); + } + } + + #cancelAndNullifyTimer() { + if (this.#downloadTimer) { + lazy.logConsole.debug("Cancel and nullify download timer."); + this.#downloadTimer.cancel(); + this.#downloadTimer = null; + } + } + + #createTimerToPopulateMap() { + if ( + this.#downloadRetries >= + TELEMETRY_CATEGORIZATION_DOWNLOAD_SETTINGS.maxTriesPerSession + ) { + return; + } + if (!this.#downloadTimer) { + this.#downloadTimer = Cc["@mozilla.org/timer;1"].createInstance( + Ci.nsITimer + ); + } + lazy.logConsole.debug("Create timer to retry downloading attachments."); + let delay = + TELEMETRY_CATEGORIZATION_DOWNLOAD_SETTINGS.base + + randomInteger( + TELEMETRY_CATEGORIZATION_DOWNLOAD_SETTINGS.minAdjust, + TELEMETRY_CATEGORIZATION_DOWNLOAD_SETTINGS.maxAdjust + ); + this.#downloadTimer.initWithCallback( + async () => { + this.#downloadRetries += 1; + let records = await this.#client.get(); + this.#clearAndPopulateMap(records); + }, + delay, + Ci.nsITimer.TYPE_ONE_SHOT + ); + } +} + +function randomInteger(min, max) { + 
return Math.floor(Math.random() * (max - min + 1)) + min; +} + +export var SearchSERPDomainToCategoriesMap = new DomainToCategoriesMap(); +export var SearchSERPTelemetry = new TelemetryHandler(); +export var SearchSERPCategorization = new SERPCategorizer(); +export var SERPCategorizationRecorder = new CategorizationRecorder(); +export var SearchSERPCategorizationEventScheduler = + new CategorizationEventScheduler(); |