diff options
Diffstat (limited to 'browser/components/search/SearchSERPTelemetry.sys.mjs')
-rw-r--r-- | browser/components/search/SearchSERPTelemetry.sys.mjs | 305 |
1 files changed, 148 insertions, 157 deletions
diff --git a/browser/components/search/SearchSERPTelemetry.sys.mjs b/browser/components/search/SearchSERPTelemetry.sys.mjs index 00105241bb..fa593be08c 100644 --- a/browser/components/search/SearchSERPTelemetry.sys.mjs +++ b/browser/components/search/SearchSERPTelemetry.sys.mjs @@ -7,6 +7,7 @@ import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs"; const lazy = {}; ChromeUtils.defineESModuleGetters(lazy, { + BasePromiseWorker: "resource://gre/modules/PromiseWorker.sys.mjs", BrowserSearchTelemetry: "resource:///modules/BrowserSearchTelemetry.sys.mjs", PrivateBrowsingUtils: "resource://gre/modules/PrivateBrowsingUtils.sys.mjs", Region: "resource://gre/modules/Region.sys.mjs", @@ -94,6 +95,10 @@ XPCOMUtils.defineLazyPreferenceGetter( export const SearchSERPTelemetryUtils = { ACTIONS: { CLICKED: "clicked", + // specific to cookie banner + CLICKED_ACCEPT: "clicked_accept", + CLICKED_REJECT: "clicked_reject", + CLICKED_MORE_OPTIONS: "clicked_more_options", EXPANDED: "expanded", SUBMITTED: "submitted", }, @@ -103,6 +108,7 @@ export const SearchSERPTelemetryUtils = { AD_LINK: "ad_link", AD_SIDEBAR: "ad_sidebar", AD_SITELINK: "ad_sitelink", + COOKIE_BANNER: "cookie_banner", INCONTENT_SEARCHBOX: "incontent_searchbox", NON_ADS_LINK: "non_ads_link", REFINED_SEARCH_BUTTONS: "refined_search_buttons", @@ -403,6 +409,10 @@ class TelemetryHandler { ); } + newProvider.ignoreLinkRegexps = provider.ignoreLinkRegexps?.length + ? provider.ignoreLinkRegexps.map(r => new RegExp(r)) + : []; + newProvider.nonAdsLinkRegexps = provider.nonAdsLinkRegexps?.length ? provider.nonAdsLinkRegexps.map(r => new RegExp(r)) : []; @@ -412,6 +422,9 @@ class TelemetryHandler { regexp: new RegExp(provider.shoppingTab.regexp), }; } + + newProvider.nonAdsLinkQueryParamNames = + provider.nonAdsLinkQueryParamNames ?? []; return newProvider; }); this._contentHandler._searchProviderInfo = this._searchProviderInfo; @@ -429,8 +442,8 @@ class TelemetryHandler { this._contentHandler._reportPageWithAdImpressions(info, browser); } - reportPageDomains(info, browser) { - this._contentHandler._reportPageDomains(info, browser); + async reportPageDomains(info, browser) { + await this._contentHandler._reportPageDomains(info, browser); } reportPageImpression(info, browser) { @@ -1212,7 +1225,7 @@ class ContentHandler { ); } - observe(aSubject, aTopic, aData) { + observe(aSubject, aTopic) { switch (aTopic) { case "http-on-stop-request": this._reportChannelBandwidth(aSubject); @@ -1330,6 +1343,11 @@ class ContentHandler { let originURL = wrappedChannel.originURI?.spec; let url = wrappedChannel.finalURL; + + if (info.ignoreLinkRegexps.some(r => r.test(url))) { + return; + } + // Some channels re-direct by loading pages that return 200. The result // is the channel will have an originURL that changes from the SERP to // either a nonAdsRegexp or an extraAdServersRegexps. This is typical @@ -1434,6 +1452,30 @@ class ContentHandler { let startFindComponent = Cu.now(); let parsedUrl = new URL(url); + + // Organic links may contain query param values mapped to links shown + // on the SERP at page load. If a stored component depends on that + // value, we need to be able to recover it or else we'll always consider + // it a non_ads_link. + if ( + info.nonAdsLinkQueryParamNames.length && + info.nonAdsLinkRegexps.some(r => r.test(url)) + ) { + let newParsedUrl; + for (let key of info.nonAdsLinkQueryParamNames) { + let paramValue = parsedUrl.searchParams.get(key); + if (paramValue) { + try { + newParsedUrl = /^https?:\/\//.test(paramValue) + ? new URL(paramValue) + : new URL(paramValue, parsedUrl.origin); + break; + } catch (e) {} + } + } + parsedUrl = newParsedUrl ?? parsedUrl; + } + // Determine the component type of the link. let type; for (let [ @@ -1629,8 +1671,8 @@ class ContentHandler { * * @param {object} info * The search provider infomation for the page. - * @param {string} info.type - * The component type that was clicked on. + * @param {string} info.target + * The target component that was interacted with. * @param {string} info.action * The action taken on the page. * @param {object} browser @@ -1643,22 +1685,23 @@ class ContentHandler { } let telemetryState = item.browserTelemetryStateMap.get(browser); let impressionId = telemetryState?.impressionId; - if (info.type && impressionId) { + if (info.target && impressionId) { lazy.logConsole.debug(`Recorded page action:`, { impressionId: telemetryState.impressionId, - type: info.type, + target: info.target, action: info.action, }); Glean.serp.engagement.record({ impression_id: impressionId, action: info.action, - target: info.type, + target: info.target, }); impressionIdsWithoutEngagementsSet.delete(impressionId); // In-content searches are not be categorized with a type, so they will // not be picked up in the network processes. if ( - info.type == SearchSERPTelemetryUtils.COMPONENTS.INCONTENT_SEARCHBOX && + info.target == + SearchSERPTelemetryUtils.COMPONENTS.INCONTENT_SEARCHBOX && info.action == SearchSERPTelemetryUtils.ACTIONS.SUBMITTED ) { telemetryState.searchBoxSubmitted = true; @@ -1667,6 +1710,7 @@ class ContentHandler { SearchSERPTelemetryUtils.INCONTENT_SOURCES.SEARCHBOX ); } + Services.obs.notifyObservers(null, "reported-page-with-action"); } else { lazy.logConsole.warn( "Expected to report a", @@ -1712,23 +1756,23 @@ class ContentHandler { } /** - * Initiates the categorization and reporting of domains extracted from - * SERPs. - * - * @param {object} info - * The search provider infomation for the page. - * @param {Set} info.nonAdDomains - The non-ad domains extracted from the page. - * @param {Set} info.adDomains - The ad domains extracted from the page. - * @param {object} browser - * The browser associated with the page. - */ - _reportPageDomains(info, browser) { + * Initiates the categorization and reporting of domains extracted from + * SERPs. + * + * @param {object} info + * The search provider infomation for the page. + * @param {Set} info.nonAdDomains + The non-ad domains extracted from the page. + * @param {Set} info.adDomains + The ad domains extracted from the page. + * @param {object} browser + * The browser associated with the page. + */ + async _reportPageDomains(info, browser) { let item = this._findItemForBrowser(browser); let telemetryState = item.browserTelemetryStateMap.get(browser); if (lazy.serpEventTelemetryCategorization && telemetryState) { - let result = SearchSERPCategorization.maybeCategorizeSERP( + let result = await SearchSERPCategorization.maybeCategorizeSERP( info.nonAdDomains, info.adDomains, item.info.provider @@ -1808,12 +1852,10 @@ class SERPCategorizer { * Domains from organic results extracted from the page. * @param {Set} adDomains * Domains from ad results extracted from the page. - * @param {string} provider - * The provider associated with the page. * @returns {CategorizationResult | null} * The final categorization result. Returns null if the map was empty. */ - maybeCategorizeSERP(nonAdDomains, adDomains, provider) { + async maybeCategorizeSERP(nonAdDomains, adDomains) { // Per DS, if the map was empty (e.g. because of a technical issue // downloading the data), we shouldn't report telemetry. // Thus, there is no point attempting to categorize the SERP. @@ -1822,15 +1864,13 @@ class SERPCategorizer { } let resultsToReport = {}; - let processedDomains = this.processDomains(nonAdDomains, provider); - let results = this.applyCategorizationLogic(processedDomains); + let results = await this.applyCategorizationLogic(nonAdDomains); resultsToReport.organic_category = results.category; resultsToReport.organic_num_domains = results.num_domains; resultsToReport.organic_num_unknown = results.num_unknown; resultsToReport.organic_num_inconclusive = results.num_inconclusive; - processedDomains = this.processDomains(adDomains, provider); - results = this.applyCategorizationLogic(processedDomains); + results = await this.applyCategorizationLogic(adDomains); resultsToReport.sponsored_category = results.category; resultsToReport.sponsored_num_domains = results.num_domains; resultsToReport.sponsored_num_unknown = results.num_unknown; @@ -1851,22 +1891,18 @@ class SERPCategorizer { * The final categorization results. Keys are: "category", "num_domains", * "num_unknown" and "num_inconclusive". */ - applyCategorizationLogic(domains) { + async applyCategorizationLogic(domains) { let domainInfo = {}; let domainsCount = 0; let unknownsCount = 0; let inconclusivesCount = 0; - // Per a request from Data Science, we need to limit the number of domains - // categorized to 10 non-ad domains and 10 ad domains. - domains = new Set( - [...domains].slice(0, CATEGORIZATION_SETTINGS.MAX_DOMAINS_TO_CATEGORIZE) - ); - for (let domain of domains) { domainsCount++; - let categoryCandidates = SearchSERPDomainToCategoriesMap.get(domain); + let categoryCandidates = await SearchSERPDomainToCategoriesMap.get( + domain + ); if (!categoryCandidates.length) { unknownsCount++; @@ -1919,65 +1955,6 @@ class SERPCategorizer { }; } - /** - * Processes raw domains extracted from the SERP into their final form before - * categorization. - * - * @param {Set} domains - * The domains extracted from the page. - * @param {string} provider - * The provider associated with the page. - * @returns {Set} processedDomains - * The final set of processed domains for a page. - */ - processDomains(domains, provider) { - let processedDomains = new Set(); - - for (let domain of domains) { - // Don't include domains associated with the search provider. - if ( - domain.startsWith(`${provider}.`) || - domain.includes(`.${provider}.`) - ) { - continue; - } - let domainWithoutSubdomains = this.#stripDomainOfSubdomains(domain); - // We may have come across the same domain twice, once with www. prefixed - // and another time without. - if ( - domainWithoutSubdomains && - !processedDomains.has(domainWithoutSubdomains) - ) { - processedDomains.add(domainWithoutSubdomains); - } - } - - return processedDomains; - } - - /** - * Helper to strip domains of any subdomains. - * - * @param {string} domain - * The domain to strip of any subdomains. - * @returns {object} browser - * The given domain with any subdomains removed. - */ - #stripDomainOfSubdomains(domain) { - let tld; - // Can throw an exception if the input has too few domain levels. - try { - tld = Services.eTLD.getKnownPublicSuffixFromHost(domain); - } catch (ex) { - return ""; - } - - let domainWithoutTLD = domain.substring(0, domain.length - tld.length); - let secondLevelDomain = domainWithoutTLD.split(".").at(-2); - - return secondLevelDomain ? `${secondLevelDomain}.${tld}` : ""; - } - #chooseRandomlyFrom(categories) { let randIdx = Math.floor(Math.random() * categories.length); return categories[randIdx]; @@ -2075,7 +2052,7 @@ class CategorizationEventScheduler { this.#init = false; } - observe(subject, topic, data) { + observe(subject, topic) { switch (topic) { case "idle": lazy.logConsole.debug("Triggering all callbacks due to idle."); @@ -2167,17 +2144,13 @@ class CategorizationRecorder { */ /** - * Maps domain to categories, with data synced with Remote Settings. + * Maps domain to categories, with its data synced using Remote Settings. The + * data is downloaded from Remote Settings and stored in a map in a worker + * thread to avoid processing the data from the attachments from occupying + * the main thread. */ class DomainToCategoriesMap { /** - * Contains the domain to category scores. - * - * @type {Object<string, Array<DomainCategoryScore>> | null} - */ - #map = null; - - /** * Latest version number of the attachments. * * @type {number | null} @@ -2222,6 +2195,17 @@ class DomainToCategoriesMap { #downloadRetries = 0; /** + * Whether the mappings are empty. + */ + #empty = true; + + /** + * @type {BasePromiseWorker|null} Worker used to access the raw domain + * to categories map data. + */ + #worker = null; + + /** * Runs at application startup with startup idle tasks. If the SERP * categorization preference is enabled, it creates a Remote Settings * client to listen to updates, and populates the map. @@ -2231,14 +2215,18 @@ class DomainToCategoriesMap { return; } lazy.logConsole.debug("Initializing domain-to-categories map."); - this.#setupClientAndMap(); + this.#worker = new lazy.BasePromiseWorker( + "resource:///modules/DomainToCategoriesMap.worker.mjs", + { type: "module" } + ); + await this.#setupClientAndMap(); this.#init = true; } uninit() { if (this.#init) { lazy.logConsole.debug("Un-initializing domain-to-categories map."); - this.#clearClientAndMap(); + this.#clearClientAndWorker(); this.#cancelAndNullifyTimer(); this.#init = false; } @@ -2252,16 +2240,16 @@ class DomainToCategoriesMap { * An array containing categories and their respective score. If no record * for the domain is available, return an empty array. */ - get(domain) { + async get(domain) { if (this.empty) { return []; } - lazy.gCryptoHash.init(lazy.gCryptoHash.MD5); + lazy.gCryptoHash.init(lazy.gCryptoHash.SHA256); let bytes = new TextEncoder().encode(domain); lazy.gCryptoHash.update(bytes, domain.length); let hash = lazy.gCryptoHash.finish(true); - let rawValues = this.#map[hash] ?? []; - if (rawValues.length) { + let rawValues = await this.#worker.post("getScores", [hash]); + if (rawValues?.length) { let output = []; // Transform data into a more readable format. // [x, y] => { category: x, score: y } @@ -2292,7 +2280,7 @@ class DomainToCategoriesMap { * @returns {boolean} */ get empty() { - return !this.#map; + return this.#empty; } /** @@ -2303,8 +2291,11 @@ class DomainToCategoriesMap { * An object where the key is a hashed domain and the value is an array * containing an arbitrary number of DomainCategoryScores. */ - overrideMapForTests(domainToCategoriesMap) { - this.#map = domainToCategoriesMap; + async overrideMapForTests(domainToCategoriesMap) { + let hasResults = await this.#worker.post("overrideMapForTests", [ + domainToCategoriesMap, + ]); + this.#empty = !hasResults; } async #setupClientAndMap() { @@ -2321,7 +2312,7 @@ class DomainToCategoriesMap { await this.#clearAndPopulateMap(records); } - #clearClientAndMap() { + #clearClientAndWorker() { if (this.#client) { lazy.logConsole.debug("Removing Remote Settings client."); this.#client.off("sync", this.#onSettingsSync); @@ -2330,11 +2321,16 @@ class DomainToCategoriesMap { this.#downloadRetries = 0; } - if (this.#map) { + if (!this.#empty) { lazy.logConsole.debug("Clearing domain-to-categories map."); - this.#map = null; + this.#empty = true; this.#version = null; } + + if (this.#worker) { + this.#worker.terminate(); + this.#worker = null; + } } /** @@ -2394,11 +2390,11 @@ class DomainToCategoriesMap { * */ async #clearAndPopulateMap(records) { - // Set map to null so that if there are errors in the downloads, consumers - // will be able to know whether the map has information. Once we've - // successfully downloaded attachments and are parsing them, a non-null - // object will be created. - this.#map = null; + // Empty map so that if there are errors in the download process, callers + // querying the map won't use information we know is already outdated. + await this.#worker.post("emptyMap"); + + this.#empty = true; this.#version = null; this.#cancelAndNullifyTimer(); @@ -2408,6 +2404,7 @@ class DomainToCategoriesMap { } let fileContents = []; + let start = Cu.now(); for (let record of records) { let result; // Downloading attachments can fail. @@ -2420,10 +2417,13 @@ class DomainToCategoriesMap { } fileContents.push(result.buffer); } + ChromeUtils.addProfilerMarker( + "SearchSERPTelemetry.#clearAndPopulateMap", + start, + "Download attachments." + ); - // All attachments should have the same version number. If for whatever - // reason they don't, we should only use the attachments with the latest - // version. + // Attachments should have a version number. this.#version = this.#retrieveLatestVersion(records); if (!this.#version) { @@ -2431,37 +2431,28 @@ class DomainToCategoriesMap { return; } - // Queue the series of assignments. - for (let i = 0; i < fileContents.length; ++i) { - let buffer = fileContents[i]; - Services.tm.idleDispatchToMainThread(() => { - let start = Cu.now(); - let json; - try { - json = JSON.parse(new TextDecoder().decode(buffer)); - } catch (ex) { - // TODO: If there was an error decoding the buffer, we may want to - // dispatch an error in telemetry or try again. - return; - } - ChromeUtils.addProfilerMarker( - "SearchSERPTelemetry.#clearAndPopulateMap", - start, - "Convert buffer to JSON." - ); - if (!this.#map) { - this.#map = {}; - } - Object.assign(this.#map, json); - lazy.logConsole.debug("Updated domain-to-categories map."); - if (i == fileContents.length - 1) { - Services.obs.notifyObservers( - null, - "domain-to-categories-map-update-complete" - ); - } - }); - } + Services.tm.idleDispatchToMainThread(async () => { + start = Cu.now(); + let hasResults; + try { + hasResults = await this.#worker.post("populateMap", [fileContents]); + } catch (ex) { + console.error(ex); + } + + this.#empty = !hasResults; + + ChromeUtils.addProfilerMarker( + "SearchSERPTelemetry.#clearAndPopulateMap", + start, + "Convert contents to JSON." + ); + lazy.logConsole.debug("Updated domain-to-categories map."); + Services.obs.notifyObservers( + null, + "domain-to-categories-map-update-complete" + ); + }); } #cancelAndNullifyTimer() { |