diff options
Diffstat (limited to 'browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js')
-rw-r--r-- | browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js | 191 |
1 files changed, 173 insertions, 18 deletions
diff --git a/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js b/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js index 03ddb75481..e653be6c48 100644 --- a/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js +++ b/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js @@ -11,6 +11,10 @@ ChromeUtils.defineESModuleGetters(this, { SearchUtils: "resource://gre/modules/SearchUtils.sys.mjs", }); +// The search provider's name is provided to ensure we can extract domains +// from relative links, e.g. /url?=https://www.foobar.com +const SEARCH_PROVIDER_NAME = "example"; + const TESTS = [ { title: "Extract domain from href (absolute URL) - one link.", @@ -35,7 +39,7 @@ const TESTS = [ expectedDomains: ["foo.com", "bar.com", "baz.com", "qux.com"], }, { - title: "Extract domain from href (relative URL).", + title: "Extract domain from href (relative URL / URL matching provider)", extractorInfos: [ { selectors: @@ -43,38 +47,33 @@ const TESTS = [ method: "href", }, ], - expectedDomains: ["example.org"], + expectedDomains: [], }, { title: "Extract domain from data attribute - one link.", extractorInfos: [ { selectors: "#test4 [data-dtld]", - method: "data-attribute", + method: "dataAttribute", options: { dataAttributeKey: "dtld", }, }, ], - expectedDomains: ["www.abc.com"], + expectedDomains: ["abc.com"], }, { title: "Extract domain from data attribute - multiple links.", extractorInfos: [ { selectors: "#test5 [data-dtld]", - method: "data-attribute", + method: "dataAttribute", options: { dataAttributeKey: "dtld", }, }, ], - expectedDomains: [ - "www.foo.com", - "www.bar.com", - "www.baz.com", - "www.qux.com", - ], + expectedDomains: ["foo.com", "bar.com", "baz.com", "qux.com"], }, { title: "Extract domain from an href's query param value.", @@ -88,7 +87,7 @@ const TESTS = [ }, }, ], - expectedDomains: ["def.com"], + expectedDomains: ["def.com", "bar.com", "baz.com"], }, { title: @@ -144,7 +143,7 @@ const TESTS = [ }, { selectors: "#test10 [data-dtld]", - method: "data-attribute", + method: "dataAttribute", options: { dataAttributeKey: "dtld", }, @@ -158,7 +157,7 @@ const TESTS = [ }, }, ], - expectedDomains: ["foobar.com", "www.abc.com", "def.com"], + expectedDomains: ["foobar.com", "abc.com", "def.com"], }, { title: "No elements match the selectors.", @@ -176,7 +175,7 @@ const TESTS = [ extractorInfos: [ { selectors: "#test12 [data-dtld]", - method: "data-attribute", + method: "dataAttribute", options: { dataAttributeKey: "dtld", }, @@ -208,6 +207,161 @@ const TESTS = [ ], expectedDomains: [], }, + { + title: "Second-level domains to a top-level domain.", + extractorInfos: [ + { + selectors: "#test15 a", + method: "href", + }, + ], + expectedDomains: [ + "foobar.gc.ca", + "foobar.gov.uk", + "foobar.co.uk", + "foobar.co.il", + ], + }, + { + title: "URL with a long subdomain.", + extractorInfos: [ + { + selectors: "#test16 a", + method: "href", + }, + ], + expectedDomains: ["foobar.com"], + }, + { + title: "URLs with the same top level domain.", + extractorInfos: [ + { + selectors: "#test17 a", + method: "href", + }, + ], + expectedDomains: ["foobar.com"], + }, + { + title: "Maximum domains extracted from a single selector.", + extractorInfos: [ + { + selectors: "#test18 a", + method: "href", + }, + ], + expectedDomains: [ + "foobar1.com", + "foobar2.com", + "foobar3.com", + "foobar4.com", + "foobar5.com", + "foobar6.com", + "foobar7.com", + "foobar8.com", + "foobar9.com", + "foobar10.com", + ], + }, + { + // This is just in case we use multiple selectors meant for separate SERPs + // and the provider switches to re-using their markup. + title: "Maximum domains extracted from multiple matching selectors.", + extractorInfos: [ + { + selectors: "#test19 a.foo", + method: "href", + }, + { + selectors: "#test19 a.baz", + method: "href", + }, + ], + expectedDomains: [ + "foobar1.com", + "foobar2.com", + "foobar3.com", + "foobar4.com", + "foobar5.com", + "foobar6.com", + "foobar7.com", + "foobar8.com", + "foobar9.com", + // This is from the second selector. + "foobaz1.com", + ], + }, + { + title: "Bing organic result.", + extractorInfos: [ + { + selectors: "#test20 #b_results .b_algo .b_attribution cite", + method: "textContent", + }, + ], + expectedDomains: ["organic.com"], + }, + { + title: "Bing sponsored result.", + extractorInfos: [ + { + selectors: "#test21 #b_results .b_ad .b_attribution cite", + method: "textContent", + }, + ], + expectedDomains: ["sponsored.com"], + }, + { + title: "Bing carousel result.", + extractorInfos: [ + { + selectors: "#test22 .adsMvCarousel cite", + method: "textContent", + }, + ], + expectedDomains: ["fixedupfromthecarousel.com"], + }, + { + title: "Bing sidebar result.", + extractorInfos: [ + { + selectors: "#test23 aside cite", + method: "textContent", + }, + ], + expectedDomains: ["fixedupfromthesidebar.com"], + }, + { + title: "Extraction threshold respected using text content method.", + extractorInfos: [ + { + selectors: "#test24 #b_results .b_ad .b_attribution cite", + method: "textContent", + }, + ], + expectedDomains: [ + "sponsored1.com", + "sponsored2.com", + "sponsored3.com", + "sponsored4.com", + "sponsored5.com", + "sponsored6.com", + "sponsored7.com", + "sponsored8.com", + "sponsored9.com", + "sponsored10.com", + ], + }, + { + title: "Bing organic result with no protocol.", + extractorInfos: [ + { + selectors: "#test25 #b_results .b_algo .b_attribution cite", + method: "textContent", + }, + ], + expectedDomains: ["organic.com"], + }, ]; add_setup(async function () { @@ -240,14 +394,15 @@ add_task(async function test_domain_extraction_heuristics() { let expectedDomains = new Set(currentTest.expectedDomains); let actualDomains = await SpecialPowers.spawn( gBrowser.selectedBrowser, - [currentTest.extractorInfos], - extractorInfos => { + [currentTest.extractorInfos, SEARCH_PROVIDER_NAME], + (extractorInfos, searchProviderName) => { const { domainExtractor } = ChromeUtils.importESModule( "resource:///actors/SearchSERPTelemetryChild.sys.mjs" ); return domainExtractor.extractDomainsFromDocument( content.document, - extractorInfos + extractorInfos, + searchProviderName ); } ); |