summaryrefslogtreecommitdiffstats
path: root/browser/components/search/SearchSERPTelemetry.sys.mjs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 01:13:27 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 01:13:27 +0000
commit40a355a42d4a9444dc753c04c6608dade2f06a23 (patch)
tree871fc667d2de662f171103ce5ec067014ef85e61 /browser/components/search/SearchSERPTelemetry.sys.mjs
parentAdding upstream version 124.0.1. (diff)
downloadfirefox-40a355a42d4a9444dc753c04c6608dade2f06a23.tar.xz
firefox-40a355a42d4a9444dc753c04c6608dade2f06a23.zip
Adding upstream version 125.0.1.upstream/125.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'browser/components/search/SearchSERPTelemetry.sys.mjs')
-rw-r--r--browser/components/search/SearchSERPTelemetry.sys.mjs305
1 files changed, 148 insertions, 157 deletions
diff --git a/browser/components/search/SearchSERPTelemetry.sys.mjs b/browser/components/search/SearchSERPTelemetry.sys.mjs
index 00105241bb..fa593be08c 100644
--- a/browser/components/search/SearchSERPTelemetry.sys.mjs
+++ b/browser/components/search/SearchSERPTelemetry.sys.mjs
@@ -7,6 +7,7 @@ import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
const lazy = {};
ChromeUtils.defineESModuleGetters(lazy, {
+ BasePromiseWorker: "resource://gre/modules/PromiseWorker.sys.mjs",
BrowserSearchTelemetry: "resource:///modules/BrowserSearchTelemetry.sys.mjs",
PrivateBrowsingUtils: "resource://gre/modules/PrivateBrowsingUtils.sys.mjs",
Region: "resource://gre/modules/Region.sys.mjs",
@@ -94,6 +95,10 @@ XPCOMUtils.defineLazyPreferenceGetter(
export const SearchSERPTelemetryUtils = {
ACTIONS: {
CLICKED: "clicked",
+ // specific to cookie banner
+ CLICKED_ACCEPT: "clicked_accept",
+ CLICKED_REJECT: "clicked_reject",
+ CLICKED_MORE_OPTIONS: "clicked_more_options",
EXPANDED: "expanded",
SUBMITTED: "submitted",
},
@@ -103,6 +108,7 @@ export const SearchSERPTelemetryUtils = {
AD_LINK: "ad_link",
AD_SIDEBAR: "ad_sidebar",
AD_SITELINK: "ad_sitelink",
+ COOKIE_BANNER: "cookie_banner",
INCONTENT_SEARCHBOX: "incontent_searchbox",
NON_ADS_LINK: "non_ads_link",
REFINED_SEARCH_BUTTONS: "refined_search_buttons",
@@ -403,6 +409,10 @@ class TelemetryHandler {
);
}
+ newProvider.ignoreLinkRegexps = provider.ignoreLinkRegexps?.length
+ ? provider.ignoreLinkRegexps.map(r => new RegExp(r))
+ : [];
+
newProvider.nonAdsLinkRegexps = provider.nonAdsLinkRegexps?.length
? provider.nonAdsLinkRegexps.map(r => new RegExp(r))
: [];
@@ -412,6 +422,9 @@ class TelemetryHandler {
regexp: new RegExp(provider.shoppingTab.regexp),
};
}
+
+ newProvider.nonAdsLinkQueryParamNames =
+ provider.nonAdsLinkQueryParamNames ?? [];
return newProvider;
});
this._contentHandler._searchProviderInfo = this._searchProviderInfo;
@@ -429,8 +442,8 @@ class TelemetryHandler {
this._contentHandler._reportPageWithAdImpressions(info, browser);
}
- reportPageDomains(info, browser) {
- this._contentHandler._reportPageDomains(info, browser);
+ async reportPageDomains(info, browser) {
+ await this._contentHandler._reportPageDomains(info, browser);
}
reportPageImpression(info, browser) {
@@ -1212,7 +1225,7 @@ class ContentHandler {
);
}
- observe(aSubject, aTopic, aData) {
+ observe(aSubject, aTopic) {
switch (aTopic) {
case "http-on-stop-request":
this._reportChannelBandwidth(aSubject);
@@ -1330,6 +1343,11 @@ class ContentHandler {
let originURL = wrappedChannel.originURI?.spec;
let url = wrappedChannel.finalURL;
+
+ if (info.ignoreLinkRegexps.some(r => r.test(url))) {
+ return;
+ }
+
// Some channels re-direct by loading pages that return 200. The result
// is the channel will have an originURL that changes from the SERP to
// either a nonAdsRegexp or an extraAdServersRegexps. This is typical
@@ -1434,6 +1452,30 @@ class ContentHandler {
let startFindComponent = Cu.now();
let parsedUrl = new URL(url);
+
+ // Organic links may contain query param values mapped to links shown
+ // on the SERP at page load. If a stored component depends on that
+ // value, we need to be able to recover it or else we'll always consider
+ // it a non_ads_link.
+ if (
+ info.nonAdsLinkQueryParamNames.length &&
+ info.nonAdsLinkRegexps.some(r => r.test(url))
+ ) {
+ let newParsedUrl;
+ for (let key of info.nonAdsLinkQueryParamNames) {
+ let paramValue = parsedUrl.searchParams.get(key);
+ if (paramValue) {
+ try {
+ newParsedUrl = /^https?:\/\//.test(paramValue)
+ ? new URL(paramValue)
+ : new URL(paramValue, parsedUrl.origin);
+ break;
+ } catch (e) {}
+ }
+ }
+ parsedUrl = newParsedUrl ?? parsedUrl;
+ }
+
// Determine the component type of the link.
let type;
for (let [
@@ -1629,8 +1671,8 @@ class ContentHandler {
*
* @param {object} info
* The search provider infomation for the page.
- * @param {string} info.type
- * The component type that was clicked on.
+ * @param {string} info.target
+ * The target component that was interacted with.
* @param {string} info.action
* The action taken on the page.
* @param {object} browser
@@ -1643,22 +1685,23 @@ class ContentHandler {
}
let telemetryState = item.browserTelemetryStateMap.get(browser);
let impressionId = telemetryState?.impressionId;
- if (info.type && impressionId) {
+ if (info.target && impressionId) {
lazy.logConsole.debug(`Recorded page action:`, {
impressionId: telemetryState.impressionId,
- type: info.type,
+ target: info.target,
action: info.action,
});
Glean.serp.engagement.record({
impression_id: impressionId,
action: info.action,
- target: info.type,
+ target: info.target,
});
impressionIdsWithoutEngagementsSet.delete(impressionId);
// In-content searches are not be categorized with a type, so they will
// not be picked up in the network processes.
if (
- info.type == SearchSERPTelemetryUtils.COMPONENTS.INCONTENT_SEARCHBOX &&
+ info.target ==
+ SearchSERPTelemetryUtils.COMPONENTS.INCONTENT_SEARCHBOX &&
info.action == SearchSERPTelemetryUtils.ACTIONS.SUBMITTED
) {
telemetryState.searchBoxSubmitted = true;
@@ -1667,6 +1710,7 @@ class ContentHandler {
SearchSERPTelemetryUtils.INCONTENT_SOURCES.SEARCHBOX
);
}
+ Services.obs.notifyObservers(null, "reported-page-with-action");
} else {
lazy.logConsole.warn(
"Expected to report a",
@@ -1712,23 +1756,23 @@ class ContentHandler {
}
/**
- * Initiates the categorization and reporting of domains extracted from
- * SERPs.
- *
- * @param {object} info
- * The search provider infomation for the page.
- * @param {Set} info.nonAdDomains
- The non-ad domains extracted from the page.
- * @param {Set} info.adDomains
- The ad domains extracted from the page.
- * @param {object} browser
- * The browser associated with the page.
- */
- _reportPageDomains(info, browser) {
+ * Initiates the categorization and reporting of domains extracted from
+ * SERPs.
+ *
+ * @param {object} info
+ * The search provider infomation for the page.
+ * @param {Set} info.nonAdDomains
+ The non-ad domains extracted from the page.
+ * @param {Set} info.adDomains
+ The ad domains extracted from the page.
+ * @param {object} browser
+ * The browser associated with the page.
+ */
+ async _reportPageDomains(info, browser) {
let item = this._findItemForBrowser(browser);
let telemetryState = item.browserTelemetryStateMap.get(browser);
if (lazy.serpEventTelemetryCategorization && telemetryState) {
- let result = SearchSERPCategorization.maybeCategorizeSERP(
+ let result = await SearchSERPCategorization.maybeCategorizeSERP(
info.nonAdDomains,
info.adDomains,
item.info.provider
@@ -1808,12 +1852,10 @@ class SERPCategorizer {
* Domains from organic results extracted from the page.
* @param {Set} adDomains
* Domains from ad results extracted from the page.
- * @param {string} provider
- * The provider associated with the page.
* @returns {CategorizationResult | null}
* The final categorization result. Returns null if the map was empty.
*/
- maybeCategorizeSERP(nonAdDomains, adDomains, provider) {
+ async maybeCategorizeSERP(nonAdDomains, adDomains) {
// Per DS, if the map was empty (e.g. because of a technical issue
// downloading the data), we shouldn't report telemetry.
// Thus, there is no point attempting to categorize the SERP.
@@ -1822,15 +1864,13 @@ class SERPCategorizer {
}
let resultsToReport = {};
- let processedDomains = this.processDomains(nonAdDomains, provider);
- let results = this.applyCategorizationLogic(processedDomains);
+ let results = await this.applyCategorizationLogic(nonAdDomains);
resultsToReport.organic_category = results.category;
resultsToReport.organic_num_domains = results.num_domains;
resultsToReport.organic_num_unknown = results.num_unknown;
resultsToReport.organic_num_inconclusive = results.num_inconclusive;
- processedDomains = this.processDomains(adDomains, provider);
- results = this.applyCategorizationLogic(processedDomains);
+ results = await this.applyCategorizationLogic(adDomains);
resultsToReport.sponsored_category = results.category;
resultsToReport.sponsored_num_domains = results.num_domains;
resultsToReport.sponsored_num_unknown = results.num_unknown;
@@ -1851,22 +1891,18 @@ class SERPCategorizer {
* The final categorization results. Keys are: "category", "num_domains",
* "num_unknown" and "num_inconclusive".
*/
- applyCategorizationLogic(domains) {
+ async applyCategorizationLogic(domains) {
let domainInfo = {};
let domainsCount = 0;
let unknownsCount = 0;
let inconclusivesCount = 0;
- // Per a request from Data Science, we need to limit the number of domains
- // categorized to 10 non-ad domains and 10 ad domains.
- domains = new Set(
- [...domains].slice(0, CATEGORIZATION_SETTINGS.MAX_DOMAINS_TO_CATEGORIZE)
- );
-
for (let domain of domains) {
domainsCount++;
- let categoryCandidates = SearchSERPDomainToCategoriesMap.get(domain);
+ let categoryCandidates = await SearchSERPDomainToCategoriesMap.get(
+ domain
+ );
if (!categoryCandidates.length) {
unknownsCount++;
@@ -1919,65 +1955,6 @@ class SERPCategorizer {
};
}
- /**
- * Processes raw domains extracted from the SERP into their final form before
- * categorization.
- *
- * @param {Set} domains
- * The domains extracted from the page.
- * @param {string} provider
- * The provider associated with the page.
- * @returns {Set} processedDomains
- * The final set of processed domains for a page.
- */
- processDomains(domains, provider) {
- let processedDomains = new Set();
-
- for (let domain of domains) {
- // Don't include domains associated with the search provider.
- if (
- domain.startsWith(`${provider}.`) ||
- domain.includes(`.${provider}.`)
- ) {
- continue;
- }
- let domainWithoutSubdomains = this.#stripDomainOfSubdomains(domain);
- // We may have come across the same domain twice, once with www. prefixed
- // and another time without.
- if (
- domainWithoutSubdomains &&
- !processedDomains.has(domainWithoutSubdomains)
- ) {
- processedDomains.add(domainWithoutSubdomains);
- }
- }
-
- return processedDomains;
- }
-
- /**
- * Helper to strip domains of any subdomains.
- *
- * @param {string} domain
- * The domain to strip of any subdomains.
- * @returns {object} browser
- * The given domain with any subdomains removed.
- */
- #stripDomainOfSubdomains(domain) {
- let tld;
- // Can throw an exception if the input has too few domain levels.
- try {
- tld = Services.eTLD.getKnownPublicSuffixFromHost(domain);
- } catch (ex) {
- return "";
- }
-
- let domainWithoutTLD = domain.substring(0, domain.length - tld.length);
- let secondLevelDomain = domainWithoutTLD.split(".").at(-2);
-
- return secondLevelDomain ? `${secondLevelDomain}.${tld}` : "";
- }
-
#chooseRandomlyFrom(categories) {
let randIdx = Math.floor(Math.random() * categories.length);
return categories[randIdx];
@@ -2075,7 +2052,7 @@ class CategorizationEventScheduler {
this.#init = false;
}
- observe(subject, topic, data) {
+ observe(subject, topic) {
switch (topic) {
case "idle":
lazy.logConsole.debug("Triggering all callbacks due to idle.");
@@ -2167,17 +2144,13 @@ class CategorizationRecorder {
*/
/**
- * Maps domain to categories, with data synced with Remote Settings.
+ * Maps domain to categories, with its data synced using Remote Settings. The
+ * data is downloaded from Remote Settings and stored in a map in a worker
+ * thread to avoid processing the data from the attachments from occupying
+ * the main thread.
*/
class DomainToCategoriesMap {
/**
- * Contains the domain to category scores.
- *
- * @type {Object<string, Array<DomainCategoryScore>> | null}
- */
- #map = null;
-
- /**
* Latest version number of the attachments.
*
* @type {number | null}
@@ -2222,6 +2195,17 @@ class DomainToCategoriesMap {
#downloadRetries = 0;
/**
+ * Whether the mappings are empty.
+ */
+ #empty = true;
+
+ /**
+ * @type {BasePromiseWorker|null} Worker used to access the raw domain
+ * to categories map data.
+ */
+ #worker = null;
+
+ /**
* Runs at application startup with startup idle tasks. If the SERP
* categorization preference is enabled, it creates a Remote Settings
* client to listen to updates, and populates the map.
@@ -2231,14 +2215,18 @@ class DomainToCategoriesMap {
return;
}
lazy.logConsole.debug("Initializing domain-to-categories map.");
- this.#setupClientAndMap();
+ this.#worker = new lazy.BasePromiseWorker(
+ "resource:///modules/DomainToCategoriesMap.worker.mjs",
+ { type: "module" }
+ );
+ await this.#setupClientAndMap();
this.#init = true;
}
uninit() {
if (this.#init) {
lazy.logConsole.debug("Un-initializing domain-to-categories map.");
- this.#clearClientAndMap();
+ this.#clearClientAndWorker();
this.#cancelAndNullifyTimer();
this.#init = false;
}
@@ -2252,16 +2240,16 @@ class DomainToCategoriesMap {
* An array containing categories and their respective score. If no record
* for the domain is available, return an empty array.
*/
- get(domain) {
+ async get(domain) {
if (this.empty) {
return [];
}
- lazy.gCryptoHash.init(lazy.gCryptoHash.MD5);
+ lazy.gCryptoHash.init(lazy.gCryptoHash.SHA256);
let bytes = new TextEncoder().encode(domain);
lazy.gCryptoHash.update(bytes, domain.length);
let hash = lazy.gCryptoHash.finish(true);
- let rawValues = this.#map[hash] ?? [];
- if (rawValues.length) {
+ let rawValues = await this.#worker.post("getScores", [hash]);
+ if (rawValues?.length) {
let output = [];
// Transform data into a more readable format.
// [x, y] => { category: x, score: y }
@@ -2292,7 +2280,7 @@ class DomainToCategoriesMap {
* @returns {boolean}
*/
get empty() {
- return !this.#map;
+ return this.#empty;
}
/**
@@ -2303,8 +2291,11 @@ class DomainToCategoriesMap {
* An object where the key is a hashed domain and the value is an array
* containing an arbitrary number of DomainCategoryScores.
*/
- overrideMapForTests(domainToCategoriesMap) {
- this.#map = domainToCategoriesMap;
+ async overrideMapForTests(domainToCategoriesMap) {
+ let hasResults = await this.#worker.post("overrideMapForTests", [
+ domainToCategoriesMap,
+ ]);
+ this.#empty = !hasResults;
}
async #setupClientAndMap() {
@@ -2321,7 +2312,7 @@ class DomainToCategoriesMap {
await this.#clearAndPopulateMap(records);
}
- #clearClientAndMap() {
+ #clearClientAndWorker() {
if (this.#client) {
lazy.logConsole.debug("Removing Remote Settings client.");
this.#client.off("sync", this.#onSettingsSync);
@@ -2330,11 +2321,16 @@ class DomainToCategoriesMap {
this.#downloadRetries = 0;
}
- if (this.#map) {
+ if (!this.#empty) {
lazy.logConsole.debug("Clearing domain-to-categories map.");
- this.#map = null;
+ this.#empty = true;
this.#version = null;
}
+
+ if (this.#worker) {
+ this.#worker.terminate();
+ this.#worker = null;
+ }
}
/**
@@ -2394,11 +2390,11 @@ class DomainToCategoriesMap {
*
*/
async #clearAndPopulateMap(records) {
- // Set map to null so that if there are errors in the downloads, consumers
- // will be able to know whether the map has information. Once we've
- // successfully downloaded attachments and are parsing them, a non-null
- // object will be created.
- this.#map = null;
+ // Empty map so that if there are errors in the download process, callers
+ // querying the map won't use information we know is already outdated.
+ await this.#worker.post("emptyMap");
+
+ this.#empty = true;
this.#version = null;
this.#cancelAndNullifyTimer();
@@ -2408,6 +2404,7 @@ class DomainToCategoriesMap {
}
let fileContents = [];
+ let start = Cu.now();
for (let record of records) {
let result;
// Downloading attachments can fail.
@@ -2420,10 +2417,13 @@ class DomainToCategoriesMap {
}
fileContents.push(result.buffer);
}
+ ChromeUtils.addProfilerMarker(
+ "SearchSERPTelemetry.#clearAndPopulateMap",
+ start,
+ "Download attachments."
+ );
- // All attachments should have the same version number. If for whatever
- // reason they don't, we should only use the attachments with the latest
- // version.
+ // Attachments should have a version number.
this.#version = this.#retrieveLatestVersion(records);
if (!this.#version) {
@@ -2431,37 +2431,28 @@ class DomainToCategoriesMap {
return;
}
- // Queue the series of assignments.
- for (let i = 0; i < fileContents.length; ++i) {
- let buffer = fileContents[i];
- Services.tm.idleDispatchToMainThread(() => {
- let start = Cu.now();
- let json;
- try {
- json = JSON.parse(new TextDecoder().decode(buffer));
- } catch (ex) {
- // TODO: If there was an error decoding the buffer, we may want to
- // dispatch an error in telemetry or try again.
- return;
- }
- ChromeUtils.addProfilerMarker(
- "SearchSERPTelemetry.#clearAndPopulateMap",
- start,
- "Convert buffer to JSON."
- );
- if (!this.#map) {
- this.#map = {};
- }
- Object.assign(this.#map, json);
- lazy.logConsole.debug("Updated domain-to-categories map.");
- if (i == fileContents.length - 1) {
- Services.obs.notifyObservers(
- null,
- "domain-to-categories-map-update-complete"
- );
- }
- });
- }
+ Services.tm.idleDispatchToMainThread(async () => {
+ start = Cu.now();
+ let hasResults;
+ try {
+ hasResults = await this.#worker.post("populateMap", [fileContents]);
+ } catch (ex) {
+ console.error(ex);
+ }
+
+ this.#empty = !hasResults;
+
+ ChromeUtils.addProfilerMarker(
+ "SearchSERPTelemetry.#clearAndPopulateMap",
+ start,
+ "Convert contents to JSON."
+ );
+ lazy.logConsole.debug("Updated domain-to-categories map.");
+ Services.obs.notifyObservers(
+ null,
+ "domain-to-categories-map-update-complete"
+ );
+ });
}
#cancelAndNullifyTimer() {