summary refs log tree commit diff stats
path: root/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js
diff options
context:
space:
mode:
Diffstat (limited to 'browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js')
-rw-r--r-- browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js 191
1 files changed, 173 insertions, 18 deletions
diff --git a/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js b/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js
index 03ddb75481..e653be6c48 100644
--- a/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js
+++ b/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js
@@ -11,6 +11,10 @@ ChromeUtils.defineESModuleGetters(this, {
SearchUtils: "resource://gre/modules/SearchUtils.sys.mjs",
});
+// The search provider's name is provided to ensure we can extract domains
+// from relative links, e.g. /url?=https://www.foobar.com
+const SEARCH_PROVIDER_NAME = "example";
+
const TESTS = [
{
title: "Extract domain from href (absolute URL) - one link.",
@@ -35,7 +39,7 @@ const TESTS = [
expectedDomains: ["foo.com", "bar.com", "baz.com", "qux.com"],
},
{
- title: "Extract domain from href (relative URL).",
+ title: "Extract domain from href (relative URL / URL matching provider)",
extractorInfos: [
{
selectors:
@@ -43,38 +47,33 @@ const TESTS = [
method: "href",
},
],
- expectedDomains: ["example.org"],
+ expectedDomains: [],
},
{
title: "Extract domain from data attribute - one link.",
extractorInfos: [
{
selectors: "#test4 [data-dtld]",
- method: "data-attribute",
+ method: "dataAttribute",
options: {
dataAttributeKey: "dtld",
},
},
],
- expectedDomains: ["www.abc.com"],
+ expectedDomains: ["abc.com"],
},
{
title: "Extract domain from data attribute - multiple links.",
extractorInfos: [
{
selectors: "#test5 [data-dtld]",
- method: "data-attribute",
+ method: "dataAttribute",
options: {
dataAttributeKey: "dtld",
},
},
],
- expectedDomains: [
- "www.foo.com",
- "www.bar.com",
- "www.baz.com",
- "www.qux.com",
- ],
+ expectedDomains: ["foo.com", "bar.com", "baz.com", "qux.com"],
},
{
title: "Extract domain from an href's query param value.",
@@ -88,7 +87,7 @@ const TESTS = [
},
},
],
- expectedDomains: ["def.com"],
+ expectedDomains: ["def.com", "bar.com", "baz.com"],
},
{
title:
@@ -144,7 +143,7 @@ const TESTS = [
},
{
selectors: "#test10 [data-dtld]",
- method: "data-attribute",
+ method: "dataAttribute",
options: {
dataAttributeKey: "dtld",
},
@@ -158,7 +157,7 @@ const TESTS = [
},
},
],
- expectedDomains: ["foobar.com", "www.abc.com", "def.com"],
+ expectedDomains: ["foobar.com", "abc.com", "def.com"],
},
{
title: "No elements match the selectors.",
@@ -176,7 +175,7 @@ const TESTS = [
extractorInfos: [
{
selectors: "#test12 [data-dtld]",
- method: "data-attribute",
+ method: "dataAttribute",
options: {
dataAttributeKey: "dtld",
},
@@ -208,6 +207,161 @@ const TESTS = [
],
expectedDomains: [],
},
+ {
+ title: "Second-level domains to a top-level domain.",
+ extractorInfos: [
+ {
+ selectors: "#test15 a",
+ method: "href",
+ },
+ ],
+ expectedDomains: [
+ "foobar.gc.ca",
+ "foobar.gov.uk",
+ "foobar.co.uk",
+ "foobar.co.il",
+ ],
+ },
+ {
+ title: "URL with a long subdomain.",
+ extractorInfos: [
+ {
+ selectors: "#test16 a",
+ method: "href",
+ },
+ ],
+ expectedDomains: ["foobar.com"],
+ },
+ {
+ title: "URLs with the same top level domain.",
+ extractorInfos: [
+ {
+ selectors: "#test17 a",
+ method: "href",
+ },
+ ],
+ expectedDomains: ["foobar.com"],
+ },
+ {
+ title: "Maximum domains extracted from a single selector.",
+ extractorInfos: [
+ {
+ selectors: "#test18 a",
+ method: "href",
+ },
+ ],
+ expectedDomains: [
+ "foobar1.com",
+ "foobar2.com",
+ "foobar3.com",
+ "foobar4.com",
+ "foobar5.com",
+ "foobar6.com",
+ "foobar7.com",
+ "foobar8.com",
+ "foobar9.com",
+ "foobar10.com",
+ ],
+ },
+ {
+ // This is just in case we use multiple selectors meant for separate SERPs
+ // and the provider switches to re-using their markup.
+ title: "Maximum domains extracted from multiple matching selectors.",
+ extractorInfos: [
+ {
+ selectors: "#test19 a.foo",
+ method: "href",
+ },
+ {
+ selectors: "#test19 a.baz",
+ method: "href",
+ },
+ ],
+ expectedDomains: [
+ "foobar1.com",
+ "foobar2.com",
+ "foobar3.com",
+ "foobar4.com",
+ "foobar5.com",
+ "foobar6.com",
+ "foobar7.com",
+ "foobar8.com",
+ "foobar9.com",
+ // This is from the second selector.
+ "foobaz1.com",
+ ],
+ },
+ {
+ title: "Bing organic result.",
+ extractorInfos: [
+ {
+ selectors: "#test20 #b_results .b_algo .b_attribution cite",
+ method: "textContent",
+ },
+ ],
+ expectedDomains: ["organic.com"],
+ },
+ {
+ title: "Bing sponsored result.",
+ extractorInfos: [
+ {
+ selectors: "#test21 #b_results .b_ad .b_attribution cite",
+ method: "textContent",
+ },
+ ],
+ expectedDomains: ["sponsored.com"],
+ },
+ {
+ title: "Bing carousel result.",
+ extractorInfos: [
+ {
+ selectors: "#test22 .adsMvCarousel cite",
+ method: "textContent",
+ },
+ ],
+ expectedDomains: ["fixedupfromthecarousel.com"],
+ },
+ {
+ title: "Bing sidebar result.",
+ extractorInfos: [
+ {
+ selectors: "#test23 aside cite",
+ method: "textContent",
+ },
+ ],
+ expectedDomains: ["fixedupfromthesidebar.com"],
+ },
+ {
+ title: "Extraction threshold respected using text content method.",
+ extractorInfos: [
+ {
+ selectors: "#test24 #b_results .b_ad .b_attribution cite",
+ method: "textContent",
+ },
+ ],
+ expectedDomains: [
+ "sponsored1.com",
+ "sponsored2.com",
+ "sponsored3.com",
+ "sponsored4.com",
+ "sponsored5.com",
+ "sponsored6.com",
+ "sponsored7.com",
+ "sponsored8.com",
+ "sponsored9.com",
+ "sponsored10.com",
+ ],
+ },
+ {
+ title: "Bing organic result with no protocol.",
+ extractorInfos: [
+ {
+ selectors: "#test25 #b_results .b_algo .b_attribution cite",
+ method: "textContent",
+ },
+ ],
+ expectedDomains: ["organic.com"],
+ },
];
add_setup(async function () {
@@ -240,14 +394,15 @@ add_task(async function test_domain_extraction_heuristics() {
let expectedDomains = new Set(currentTest.expectedDomains);
let actualDomains = await SpecialPowers.spawn(
gBrowser.selectedBrowser,
- [currentTest.extractorInfos],
- extractorInfos => {
+ [currentTest.extractorInfos, SEARCH_PROVIDER_NAME],
+ (extractorInfos, searchProviderName) => {
const { domainExtractor } = ChromeUtils.importESModule(
"resource:///actors/SearchSERPTelemetryChild.sys.mjs"
);
return domainExtractor.extractDomainsFromDocument(
content.document,
- extractorInfos
+ extractorInfos,
+ searchProviderName
);
}
);