summary refs log tree commit diff stats
path: root/browser/actors/SearchSERPTelemetryChild.sys.mjs
diff options
context:
space:
mode:
Diffstat (limited to 'browser/actors/SearchSERPTelemetryChild.sys.mjs')
-rw-r--r-- browser/actors/SearchSERPTelemetryChild.sys.mjs 68
1 files changed, 42 insertions, 26 deletions
diff --git a/browser/actors/SearchSERPTelemetryChild.sys.mjs b/browser/actors/SearchSERPTelemetryChild.sys.mjs
index c760f9a19e..b2b78941ad 100644
--- a/browser/actors/SearchSERPTelemetryChild.sys.mjs
+++ b/browser/actors/SearchSERPTelemetryChild.sys.mjs
@@ -13,13 +13,6 @@ ChromeUtils.defineESModuleGetters(lazy, {
XPCOMUtils.defineLazyPreferenceGetter(
lazy,
- "serpEventsEnabled",
- "browser.search.serpEventTelemetry.enabled",
- true
-);
-
-XPCOMUtils.defineLazyPreferenceGetter(
- lazy,
"serpEventTelemetryCategorization",
"browser.search.serpEventTelemetryCategorization.enabled",
false
@@ -1077,7 +1070,11 @@ class DomainExtractor {
break;
}
case "textContent": {
- this.#fromElementsRetrieveTextContent(elements, extractedDomains);
+ this.#fromElementsRetrieveTextContent(
+ elements,
+ extractedDomains,
+ providerName
+ );
break;
}
}
@@ -1197,8 +1194,26 @@ class DomainExtractor {
* A list of elements from the page whose text content we want to inspect.
* @param {Set<string>} extractedDomains
* The result set of domains extracted from the page.
+ * @param {string} providerName
+ * The name of the search provider.
*/
- #fromElementsRetrieveTextContent(elements, extractedDomains) {
+ #fromElementsRetrieveTextContent(elements, extractedDomains, providerName) {
+ // Not an exhaustive regex, but it fits our purpose for this method.
+ const LOOSE_URL_REGEX =
+ /^(?:https?:\/\/)?(?:www\.)?(?:[\w\-]+\.)+(?:[\w\-]{2,})/i;
+
+ // Known but acceptable limitations to this function, where the return
+ // value won't be correctly fixed up:
+ // 1) A url is embedded within other text. Ex: "xkcd.com is cool."
+ // 2) The url contains legal but unusual characters. Ex: $ ! * '
+ function fixup(textContent) {
+ return textContent
+ .toLowerCase()
+ .replaceAll(" ", "")
+ .replace(/\.$/, "")
+ .concat(".com");
+ }
+
for (let element of elements) {
if (this.#exceedsThreshold(extractedDomains.size)) {
return;
@@ -1209,18 +1224,24 @@ class DomainExtractor {
}
let domain;
- try {
- domain = new URL(textContent).hostname;
- } catch (e) {
- domain = textContent.toLowerCase().replaceAll(" ", "");
- // If the attempt to turn the text content into a URL object only fails
- // because we're missing a protocol, ".com" may already be present.
- if (!domain.endsWith(".com")) {
- domain = domain.concat(".com");
+ if (LOOSE_URL_REGEX.test(textContent)) {
+ // Creating a new URL object will throw if the protocol is missing.
+ if (!/^https?:\/\//.test(textContent)) {
+ textContent = "https://" + textContent;
}
+
+ try {
+ domain = new URL(textContent).hostname;
+ } catch (e) {
+ domain = fixup(textContent);
+ }
+ } else {
+ domain = fixup(textContent);
}
- if (!extractedDomains.has(domain)) {
- extractedDomains.add(domain);
+
+ let processedDomain = this.#processDomain(domain, providerName);
+ if (processedDomain && !extractedDomains.has(processedDomain)) {
+ extractedDomains.add(processedDomain);
}
}
}
@@ -1368,7 +1389,6 @@ export class SearchSERPTelemetryChild extends JSWindowActorChild {
}
if (
- lazy.serpEventsEnabled &&
providerInfo.components?.length &&
(eventType == "load" || eventType == "pageshow")
) {
@@ -1496,17 +1516,13 @@ export class SearchSERPTelemetryChild extends JSWindowActorChild {
// so that we remain consistent with the *.in-content:sap* count for the
// SEARCH_COUNTS histogram.
if (event.persisted) {
+ this.#checkForPageImpressionComponents();
this.#check(event.type);
- if (lazy.serpEventsEnabled) {
- this.#checkForPageImpressionComponents();
- }
}
break;
}
case "DOMContentLoaded": {
- if (lazy.serpEventsEnabled) {
- this.#checkForPageImpressionComponents();
- }
+ this.#checkForPageImpressionComponents();
this.#check(event.type);
break;
}