1 files changed, 173 insertions, 18 deletions
diff --git a/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js b/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js
index 03ddb75481..e653be6c48 100644
--- a/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js
+++ b/browser/components/search/test/browser/telemetry/browser_search_telemetry_domain_categorization_extraction.js
@@ -11,6 +11,10 @@ ChromeUtils.defineESModuleGetters(this, {
   SearchUtils: "resource://gre/modules/SearchUtils.sys.mjs",
 });
 
+// The search provider's name is provided to ensure we can extract domains
+// from relative links, e.g. /url?q=https://www.foobar.com
+const SEARCH_PROVIDER_NAME = "example";
+
 const TESTS = [
   {
     title: "Extract domain from href (absolute URL) - one link.",
@@ -35,7 +39,7 @@ const TESTS = [
     expectedDomains: ["foo.com", "bar.com", "baz.com", "qux.com"],
   },
   {
-    title: "Extract domain from href (relative URL).",
+    title: "Extract domain from href (relative URL / URL matching provider)",
     extractorInfos: [
       {
         selectors:
@@ -43,38 +47,33 @@ const TESTS = [
         method: "href",
       },
     ],
-    expectedDomains: ["example.org"],
+    expectedDomains: [],
   },
   {
     title: "Extract domain from data attribute - one link.",
     extractorInfos: [
       {
         selectors: "#test4 [data-dtld]",
-        method: "data-attribute",
+        method: "dataAttribute",
         options: {
           dataAttributeKey: "dtld",
         },
       },
     ],
-    expectedDomains: ["www.abc.com"],
+    expectedDomains: ["abc.com"],
   },
   {
     title: "Extract domain from data attribute - multiple links.",
     extractorInfos: [
       {
         selectors: "#test5 [data-dtld]",
-        method: "data-attribute",
+        method: "dataAttribute",
         options: {
           dataAttributeKey: "dtld",
         },
       },
     ],
-    expectedDomains: [
-      "www.foo.com",
-      "www.bar.com",
-      "www.baz.com",
-      "www.qux.com",
-    ],
+    expectedDomains: ["foo.com", "bar.com", "baz.com", "qux.com"],
   },
   {
     title: "Extract domain from an href's query param value.",
@@ -88,7 +87,7 @@ const TESTS = [
         },
       },
     ],
-    expectedDomains: ["def.com"],
+    expectedDomains: ["def.com", "bar.com", "baz.com"],
   },
   {
     title:
@@ -144,7 +143,7 @@ const TESTS = [
       },
       {
         selectors: "#test10 [data-dtld]",
-        method: "data-attribute",
+        method: "dataAttribute",
         options: {
           dataAttributeKey: "dtld",
         },
@@ -158,7 +157,7 @@ const TESTS = [
         },
       },
     ],
-    expectedDomains: ["foobar.com", "www.abc.com", "def.com"],
+    expectedDomains: ["foobar.com", "abc.com", "def.com"],
   },
   {
     title: "No elements match the selectors.",
@@ -176,7 +175,7 @@ const TESTS = [
     extractorInfos: [
       {
         selectors: "#test12 [data-dtld]",
-        method: "data-attribute",
+        method: "dataAttribute",
         options: {
           dataAttributeKey: "dtld",
         },
@@ -208,6 +207,161 @@ const TESTS = [
     ],
     expectedDomains: [],
   },
+  {
+    title: "Second-level domains to a top-level domain.",
+    extractorInfos: [
+      {
+        selectors: "#test15 a",
+        method: "href",
+      },
+    ],
+    expectedDomains: [
+      "foobar.gc.ca",
+      "foobar.gov.uk",
+      "foobar.co.uk",
+      "foobar.co.il",
+    ],
+  },
+  {
+    title: "URL with a long subdomain.",
+    extractorInfos: [
+      {
+        selectors: "#test16 a",
+        method: "href",
+      },
+    ],
+    expectedDomains: ["foobar.com"],
+  },
+  {
+    title: "URLs with the same top level domain.",
+    extractorInfos: [
+      {
+        selectors: "#test17 a",
+        method: "href",
+      },
+    ],
+    expectedDomains: ["foobar.com"],
+  },
+  {
+    title: "Maximum domains extracted from a single selector.",
+    extractorInfos: [
+      {
+        selectors: "#test18 a",
+        method: "href",
+      },
+    ],
+    expectedDomains: [
+      "foobar1.com",
+      "foobar2.com",
+      "foobar3.com",
+      "foobar4.com",
+      "foobar5.com",
+      "foobar6.com",
+      "foobar7.com",
+      "foobar8.com",
+      "foobar9.com",
+      "foobar10.com",
+    ],
+  },
+  {
+    // Covers the case where we ship multiple selectors meant for separate
+    // SERPs and the provider starts re-using the same markup across them.
+    title: "Maximum domains extracted from multiple matching selectors.",
+    extractorInfos: [
+      {
+        selectors: "#test19 a.foo",
+        method: "href",
+      },
+      {
+        selectors: "#test19 a.baz",
+        method: "href",
+      },
+    ],
+    expectedDomains: [
+      "foobar1.com",
+      "foobar2.com",
+      "foobar3.com",
+      "foobar4.com",
+      "foobar5.com",
+      "foobar6.com",
+      "foobar7.com",
+      "foobar8.com",
+      "foobar9.com",
+      // This is from the second selector.
+      "foobaz1.com",
+    ],
+  },
+  {
+    title: "Bing organic result.",
+    extractorInfos: [
+      {
+        selectors: "#test20 #b_results .b_algo .b_attribution cite",
+        method: "textContent",
+      },
+    ],
+    expectedDomains: ["organic.com"],
+  },
+  {
+    title: "Bing sponsored result.",
+    extractorInfos: [
+      {
+        selectors: "#test21 #b_results .b_ad .b_attribution cite",
+        method: "textContent",
+      },
+    ],
+    expectedDomains: ["sponsored.com"],
+  },
+  {
+    title: "Bing carousel result.",
+    extractorInfos: [
+      {
+        selectors: "#test22 .adsMvCarousel cite",
+        method: "textContent",
+      },
+    ],
+    expectedDomains: ["fixedupfromthecarousel.com"],
+  },
+  {
+    title: "Bing sidebar result.",
+    extractorInfos: [
+      {
+        selectors: "#test23 aside cite",
+        method: "textContent",
+      },
+    ],
+    expectedDomains: ["fixedupfromthesidebar.com"],
+  },
+  {
+    title: "Extraction threshold respected using text content method.",
+    extractorInfos: [
+      {
+        selectors: "#test24 #b_results .b_ad .b_attribution cite",
+        method: "textContent",
+      },
+    ],
+    expectedDomains: [
+      "sponsored1.com",
+      "sponsored2.com",
+      "sponsored3.com",
+      "sponsored4.com",
+      "sponsored5.com",
+      "sponsored6.com",
+      "sponsored7.com",
+      "sponsored8.com",
+      "sponsored9.com",
+      "sponsored10.com",
+    ],
+  },
+  {
+    title: "Bing organic result with no protocol.",
+    extractorInfos: [
+      {
+        selectors: "#test25 #b_results .b_algo .b_attribution cite",
+        method: "textContent",
+      },
+    ],
+    expectedDomains: ["organic.com"],
+  },
 ];
 
 add_setup(async function () {
@@ -240,14 +394,15 @@ add_task(async function test_domain_extraction_heuristics() {
     let expectedDomains = new Set(currentTest.expectedDomains);
     let actualDomains = await SpecialPowers.spawn(
       gBrowser.selectedBrowser,
-      [currentTest.extractorInfos],
-      extractorInfos => {
+      [currentTest.extractorInfos, SEARCH_PROVIDER_NAME],
+      (extractorInfos, searchProviderName) => {
         const { domainExtractor } = ChromeUtils.importESModule(
           "resource:///actors/SearchSERPTelemetryChild.sys.mjs"
         );
         return domainExtractor.extractDomainsFromDocument(
           content.document,
-          extractorInfos
+          extractorInfos,
+          searchProviderName
         );
       }
     );