/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs"; const lazy = {}; ChromeUtils.defineESModuleGetters(lazy, { clearTimeout: "resource://gre/modules/Timer.sys.mjs", setTimeout: "resource://gre/modules/Timer.sys.mjs", }); XPCOMUtils.defineLazyPreferenceGetter( lazy, "serpEventsEnabled", "browser.search.serpEventTelemetry.enabled", true ); XPCOMUtils.defineLazyPreferenceGetter( lazy, "serpEventTelemetryCategorization", "browser.search.serpEventTelemetryCategorization.enabled", false ); export const CATEGORIZATION_SETTINGS = { MAX_DOMAINS_TO_CATEGORIZE: 10, }; // Duplicated from SearchSERPTelemetry to avoid loading the module on content // startup. const SEARCH_TELEMETRY_SHARED = { PROVIDER_INFO: "SearchTelemetry:ProviderInfo", LOAD_TIMEOUT: "SearchTelemetry:LoadTimeout", SPA_LOAD_TIMEOUT: "SearchTelemetry:SPALoadTimeout", }; /** * Standard events mapped to the telemetry action. */ const EVENT_TYPE_TO_ACTION = { click: "clicked", }; /** * A map of object conditions mapped to the condition that should be run when * an event is triggered. The condition name is referenced in Remote Settings * under the optional `condition` string for an event listener. */ const CONDITIONS = { keydownEnter: event => event.key == "Enter", }; /** * SearchProviders looks after keeping track of the search provider information * received from the main process. * * It is separate to SearchTelemetryChild so that it is not constructed for each * tab, but once per process. */ class SearchProviders { constructor() { this._searchProviderInfo = null; Services.cpmm.sharedData.addEventListener("change", this); } /** * Gets the search provider information for any provider with advert information. * If there is nothing in the cache, it will obtain it from shared data. * * @returns {object} Returns the search provider information. * @see SearchTelemetry.sys.mjs */ get info() { if (this._searchProviderInfo) { return this._searchProviderInfo; } this._searchProviderInfo = Services.cpmm.sharedData.get( SEARCH_TELEMETRY_SHARED.PROVIDER_INFO ); if (!this._searchProviderInfo) { return null; } this._searchProviderInfo = this._searchProviderInfo // Filter-out non-ad providers so that we're not trying to match against // those unnecessarily. .filter(p => "extraAdServersRegexps" in p) // Pre-build the regular expressions. .map(p => { p.adServerAttributes = p.adServerAttributes ?? []; if (p.shoppingTab?.inspectRegexpInSERP) { p.shoppingTab.regexp = new RegExp(p.shoppingTab.regexp); } return { ...p, searchPageRegexp: new RegExp(p.searchPageRegexp), extraAdServersRegexps: p.extraAdServersRegexps.map( r => new RegExp(r) ), }; }); return this._searchProviderInfo; } /** * Handles events received from sharedData notifications. * * @param {object} event The event details. */ handleEvent(event) { switch (event.type) { case "change": { if (event.changedKeys.includes(SEARCH_TELEMETRY_SHARED.PROVIDER_INFO)) { // Just null out the provider information for now, we'll fetch it next // time we need it. this._searchProviderInfo = null; } break; } } } } /** * @typedef {object} EventListenerParam * @property {string} eventType * The type of event the listener should listen for. If the event type is * is non-standard, it should correspond to a definition in * CUSTOM_EVENT_TYPE_TO_DATA that will re-map it to a standard type. TODO * @property {string} target * The type of component that was the source of the event. * @property {string | null} action * The action that should be reported in telemetry. */ /** * Provides a way to add listeners to elements, as well as unload them. */ class ListenerHelper { /** * Adds each event listener in an array of event listeners to each element * in an array of elements, and sets their unloading. * * @param {Array} elements * DOM elements to add event listeners to. * @param {Array} eventListenerParams * The type of event to add the listener to. * @param {string} target */ static addListeners(elements, eventListenerParams, target) { if (!elements?.length || !eventListenerParams?.length) { return; } let document = elements[0].ownerGlobal.document; let callback = documentToEventCallbackMap.get(document); if (!callback) { return; } // The map might have entries from previous callers, so we must ensure // we don't discard existing event listener callbacks. let removeListenerCallbacks = []; if (documentToRemoveEventListenersMap.has(document)) { removeListenerCallbacks = documentToRemoveEventListenersMap.get(document); } for (let params of eventListenerParams) { let removeListeners = ListenerHelper.addListener( elements, params, target, callback ); removeListenerCallbacks = removeListenerCallbacks.concat(removeListeners); } documentToRemoveEventListenersMap.set(document, removeListenerCallbacks); } /** * Add an event listener to each element in an array of elements. * * @param {Array} elements * DOM elements to add event listeners to. * @param {EventListenerParam} eventListenerParam * @param {string} target * @param {Function} callback * @returns {Array} Array of remove event listener functions. */ static addListener(elements, eventListenerParam, target, callback) { let { action, eventType, target: customTarget } = eventListenerParam; if (customTarget) { target = customTarget; } if (!action) { action = EVENT_TYPE_TO_ACTION[eventType]; if (!action) { return []; } } // Some events might have specific conditions we want to check before // registering an engagement event. let eventCallback; if (eventListenerParam.condition) { if (CONDITIONS[eventListenerParam.condition]) { let condition = CONDITIONS[eventListenerParam.condition]; eventCallback = async event => { let start = Cu.now(); if (condition(event)) { callback({ action, target }); } ChromeUtils.addProfilerMarker( "SearchSERPTelemetryChild._eventCallback", start, "Call cached function before callback." ); }; } else { // If a component included a condition, but it wasn't found it is // due to the fact that it was added in a more recent Firefox version // than what is provided via search-telemetry-v2. Since the version of // Firefox the user is using doesn't include this condition, // we shouldn't add the event. return []; } } else { eventCallback = () => { callback({ action, target }); }; } let removeListenerCallbacks = []; for (let element of elements) { element.addEventListener(eventType, eventCallback); removeListenerCallbacks.push(() => { element.removeEventListener(eventType, eventCallback); }); } return removeListenerCallbacks; } } /** * Scans SERPs for ad components. */ class SearchAdImpression { /** * A reference to ad component information that is used if an anchor * element could not be categorized to a specific ad component. * * @type {object} */ #defaultComponent = null; /** * Maps DOM elements to AdData. * * @type {Map} * * @typedef AdData * @type {object} * @property {string} type * The type of ad component. * @property {number} adsLoaded * The number of ads counted as loaded for the component. * @property {boolean} countChildren * Whether all the children were counted for the component. */ #elementToAdDataMap = new Map(); /** * An array of components to do a top-down search. */ #topDownComponents = []; /** * A reference the providerInfo for this SERP. * * @type {object} */ #providerInfo = null; set providerInfo(providerInfo) { if (this.#providerInfo?.telemetryId == providerInfo.telemetryId) { return; } this.#providerInfo = providerInfo; // Reset values. this.#topDownComponents = []; for (let component of this.#providerInfo.components) { if (component.default) { this.#defaultComponent = component; continue; } if (component.topDown) { this.#topDownComponents.push(component); } } } /** * Check if the page has a shopping tab. * * @param {Document} document * @return {boolean} * Whether the page has a shopping tab. Defaults to false. */ hasShoppingTab(document) { if (!this.#providerInfo?.shoppingTab) { return false; } // If a provider has the inspectRegexpInSERP, we assume there must be an // associated regexp that must be used on any hrefs matched by the elements // found using the selector. If inspectRegexpInSERP is false, then check if // the number of items found using the selector matches exactly one element // to ensure we've used a fine-grained search. let elements = document.querySelectorAll( this.#providerInfo.shoppingTab.selector ); if (this.#providerInfo.shoppingTab.inspectRegexpInSERP) { let regexp = this.#providerInfo.shoppingTab.regexp; for (let element of elements) { let href = element.getAttribute("href"); if (href && regexp.test(href)) { this.#recordElementData(element, { type: "shopping_tab", count: 1, }); return true; } } } else if (elements.length == 1) { this.#recordElementData(elements[0], { type: "shopping_tab", count: 1, }); return true; } return false; } /** * Examine the list of anchors and the document object and find components * on the page. * * With the list of anchors, go through each and find the component it * belongs to and save it in elementToAdDataMap. * * Then, with the document object find components and save the results to * elementToAdDataMap. * * Lastly, combine the results together in a new Map that contains the number * of loaded, visible, and blocked results for the component. * * @param {HTMLCollectionOf} anchors * @param {Document} document * * @returns {Map} * A map where the key is a string containing the type of ad component * and the value is an object containing the number of adsLoaded, * adsVisible, and adsHidden within the component. */ categorize(anchors, document) { // Used for various functions to make relative URLs absolute. let origin = new URL(document.documentURI).origin; // Bottom up approach. this.#categorizeAnchors(anchors, origin); // Top down approach. this.#categorizeDocument(document); let componentToVisibilityMap = new Map(); let hrefToComponentMap = new Map(); let innerWindowHeight = document.ownerGlobal.innerHeight; let scrollY = document.ownerGlobal.scrollY; // Iterate over the results: // - If it's searchbox add event listeners. // - If it is a non_ads_link, map its href to component type. // - For others, map its component type and check visibility. for (let [element, data] of this.#elementToAdDataMap.entries()) { if (data.type == "incontent_searchbox") { // Bug 1880413: Deprecate hard coding the incontent search box. // If searchbox has child elements, observe those, otherwise // fallback to its parent element. let searchElements = data.childElements.length ? data.childElements : [element]; ListenerHelper.addListeners( searchElements, [ { eventType: "click", target: data.type }, { eventType: "keydown", target: data.type, action: "submitted", condition: "keydownEnter", }, ], data.type ); continue; } if (data.childElements.length) { for (let child of data.childElements) { let href = this.#extractHref(child, origin); if (href) { hrefToComponentMap.set(href, data.type); } } } else { let href = this.#extractHref(element, origin); if (href) { hrefToComponentMap.set(href, data.type); } } // If the component is a non_ads_link, skip visibility checks. if (data.type == "non_ads_link") { continue; } // If proxy children were found, check the visibility of all of them // otherwise just check the visiblity of the first child. let childElements; if (data.proxyChildElements.length) { childElements = data.proxyChildElements; } else if (data.childElements.length) { childElements = [data.childElements[0]]; } let count = this.#countVisibleAndHiddenAds( element, data.adsLoaded, childElements, innerWindowHeight, scrollY ); if (componentToVisibilityMap.has(data.type)) { let componentInfo = componentToVisibilityMap.get(data.type); componentInfo.adsLoaded += data.adsLoaded; componentInfo.adsVisible += count.adsVisible; componentInfo.adsHidden += count.adsHidden; } else { componentToVisibilityMap.set(data.type, { adsLoaded: data.adsLoaded, adsVisible: count.adsVisible, adsHidden: count.adsHidden, }); } } // Release the DOM elements from the Map. this.#elementToAdDataMap.clear(); return { componentToVisibilityMap, hrefToComponentMap }; } /** * Given an element, find the href that is most likely to make the request if * the element is clicked. If the element contains a specific data attribute * known to contain the url used to make the initial request, use it, * otherwise use its href. Specific character conversions are done to mimic * conversions likely to take place when urls are observed in network * activity. * * @param {Element} element * The element to inspect. * @param {string} origin * The origin for relative urls. * @returns {string} * The href of the element. */ #extractHref(element, origin) { let href; // Prioritize the href from a known data attribute value instead of // its href property, as the former is the initial url the page will // navigate to before being re-directed to the href. for (let name of this.#providerInfo.adServerAttributes) { if ( element.dataset[name] && this.#providerInfo.extraAdServersRegexps.some(regexp => regexp.test(element.dataset[name]) ) ) { href = element.dataset[name]; break; } } // If a data attribute value was not found, fallback to the href. href = href ?? element.getAttribute("href"); if (!href) { return ""; } // Avoid extracting or fixing up Javascript URLs. if (href.startsWith("javascript")) { return ""; } // Hrefs can be relative. if (!href.startsWith("https://") && !href.startsWith("http://")) { href = origin + href; } // Per Bug 376844, apostrophes in query params are escaped, and thus, are // percent-encoded by the time they are observed in the network. Even // though it's more comprehensive, we avoid using newURI because its more // expensive and conversions should be the exception. // e.g. /path'?q=Mozilla's -> /path'?q=Mozilla%27s let arr = href.split("?"); if (arr.length == 2 && arr[1].includes("'")) { href = arr[0] + "?" + arr[1].replaceAll("'", "%27"); } return href; } /** * Given a list of anchor elements, group them into ad components. * * The first step in the process is to check if the anchor should be * inspected. This is based on whether it contains an href or a * data-attribute values that matches an ad link, or if it contains a * pattern caught by a components included regular expression. * * Determine which component it belongs to and the number of matches for * the component. The heuristic is described in findDataForAnchor. * If there was a result and we haven't seen it before, save it in * elementToAdDataMap. * * @param {HTMLCollectionOf} anchors * The list of anchors to inspect. * @param {string} origin * The origin of the document the anchors belong to. */ #categorizeAnchors(anchors, origin) { for (let anchor of anchors) { if (this.#shouldInspectAnchor(anchor, origin)) { let result = this.#findDataForAnchor(anchor); if (result) { this.#recordElementData(result.element, { type: result.type, count: result.count, proxyChildElements: result.proxyChildElements, childElements: result.childElements, }); } if (result.relatedElements?.length) { // Bug 1880413: Deprecate related elements. // Bottom-up approach with related elements are only used for // non-link elements related to ads, like carousel arrows. ListenerHelper.addListeners( result.relatedElements, [ { action: "expanded", eventType: "click", }, ], result.type ); } } } } /** * Find components from the document object. This is mostly relevant for * components that are non-ads and don't have an obvious regular expression * that could match the pattern of the href. * * @param {Document} document */ #categorizeDocument(document) { // using the subset of components that are top down, // go through each one. for (let component of this.#topDownComponents) { // Top-down searches must have the topDown attribute. if (!component.topDown) { continue; } // Top down searches must include a parent. if (!component.included?.parent) { continue; } let parents = document.querySelectorAll( component.included.parent.selector ); if (parents.length) { let eventListeners = component.included.parent.eventListeners; if (eventListeners?.length) { ListenerHelper.addListeners(parents, eventListeners, component.type); } for (let parent of parents) { // Bug 1880413: Deprecate related elements. // Top-down related elements are either used for auto-suggested // elements of a searchbox, or elements on a page which we can't // find through a bottom up approach but we want an add a listener, // like carousels with arrows. if (component.included.related?.selector) { let relatedElements = parent.querySelectorAll( component.included.related.selector ); if (relatedElements.length) { // For the search box, related elements with event listeners are // auto-suggested terms. For everything else (e.g. carousels) // they are expanded. ListenerHelper.addListeners( relatedElements, [ { action: component.type == "incontent_searchbox" ? "submitted" : "expanded", eventType: "click", }, ], component.type ); } } if (component.included.children) { for (let child of component.included.children) { let childElements = parent.querySelectorAll(child.selector); if (childElements.length) { if (child.eventListeners) { childElements = Array.from(childElements); ListenerHelper.addListeners( childElements, child.eventListeners, child.type ?? component.type ); } if (!child.skipCount) { this.#recordElementData(parent, { type: component.type, childElements: Array.from(childElements), }); } } } } else if (!component.included.parent.skipCount) { this.#recordElementData(parent, { type: component.type, }); } } } } } /** * Evaluates whether an anchor should be inspected based on matching * regular expressions on either its href or specified data-attribute values. * * @param {HTMLAnchorElement} anchor * @param {string} origin * @returns {boolean} */ #shouldInspectAnchor(anchor, origin) { let href = anchor.getAttribute("href"); if (!href) { return false; } // Some hrefs might be relative. if (!href.startsWith("https://") && !href.startsWith("http://")) { href = origin + href; } let regexps = this.#providerInfo.extraAdServersRegexps; // Anchors can contain ad links in a data-attribute. for (let name of this.#providerInfo.adServerAttributes) { let attributeValue = anchor.dataset[name]; if ( attributeValue && regexps.some(regexp => regexp.test(attributeValue)) ) { return true; } } // Anchors can contain ad links in a specific href. if (regexps.some(regexp => regexp.test(href))) { return true; } return false; } /** * Find the component data for an anchor. * * To categorize the anchor, we iterate over the list of possible components * the anchor could be categorized. If the component is default, we skip * checking because the fallback option for all anchor links is the default. * * First, get the "parent" of the anchor which best represents the DOM element * that contains the anchor links for the component and no other component. * This parent will be cached so that other anchors that share the same * parent can be counted together. * * The check for a parent is a loop because we can define more than one best * parent since on certain SERPs, it's possible for a "better" DOM element * parent to appear occassionally. * * If no parent is found, skip this component. * * If a parent was found, check for specific child elements. * * Finding child DOM elements of a parent is optional. One reason to do so is * to use child elements instead of anchor links to count the number of ads for * a component via the `countChildren` property. This is provided because some ads * (i.e. carousels) have multiple ad links in a single child element that go to the * same location. In this scenario, all instances of the child are recorded as ads. * Subsequent anchor elements that map to the same parent are ignored. * * Whether or not a child was found, return the information that was found, * including whether or not all child elements were counted instead of anchors. * * If another anchor belonging to a parent that was previously recorded is the input * for this function, we either increment the ad count by 1 or don't increment the ad * count because the parent used `countChildren` completed the calculation in a * previous step. * * * @param {HTMLAnchorElement} anchor * The anchor to be inspected. * @returns {object} * An object containing the element representing the root DOM element for * the component, the type of component, how many ads were counted, * and whether or not the count was of all the children. */ #findDataForAnchor(anchor) { for (let component of this.#providerInfo.components) { // First, check various conditions for skipping a component. // A component should always have at least one included statement. if (!component.included) { continue; } // Top down searches are done after the bottom up search. if (component.topDown) { continue; } // The default component doesn't need to be checked, // as it will be the fallback option. if (component.default) { continue; } // The anchor shouldn't belong to an excluded parent component if one // is provided. if ( component.excluded?.parent?.selector && anchor.closest(component.excluded.parent.selector) ) { continue; } // All components with included should have a parent entry. if (!component.included.parent) { continue; } // Find the parent of the anchor. let parent = anchor.closest(component.included.parent.selector); if (!parent) { continue; } // If we've already inspected the parent, add the child element to the // list of anchors. Don't increment the ads loaded count, as we only care // about grouping the anchor with the correct parent. if (this.#elementToAdDataMap.has(parent)) { return { element: parent, childElements: [anchor], }; } let relatedElements = []; if (component.included.related?.selector) { relatedElements = parent.querySelectorAll( component.included.related.selector ); } // If the component has no defined children, return the parent element. if (component.included.children) { // Look for the first instance of a matching child selector. for (let child of component.included.children) { // If counting by child, get all of them at once. if (child.countChildren) { let proxyChildElements = parent.querySelectorAll(child.selector); if (proxyChildElements.length) { return { element: parent, type: child.type ?? component.type, proxyChildElements: Array.from(proxyChildElements), count: proxyChildElements.length, childElements: [anchor], relatedElements, }; } } else if (parent.querySelector(child.selector)) { return { element: parent, type: child.type ?? component.type, childElements: [anchor], relatedElements, }; } } } // If no children were defined for this component, or none were found // in the DOM, use the default definition. return { element: parent, type: component.type, childElements: [anchor], relatedElements, }; } // If no component was found, use default values. return { element: anchor, type: this.#defaultComponent.type, }; } /** * Determines whether or not an ad was visible or hidden. * * An ad is considered visible if the parent element containing the * component has non-zero dimensions, and all child element in the * component have non-zero dimensions and fits within the window * at the time when the impression was takent. * * For some components, like text ads, we don't send every child * element for visibility, just the first text ad. For other components * like carousels, we send all child elements because we do care about * counting how many elements of the carousel were visible. * * @param {Element} element * Element to be inspected * @param {number} adsLoaded * Number of ads initially determined to be loaded for this element. * @param {Array} childElements * List of children belonging to element. * @param {number} innerWindowHeight * Current height of the window containing the elements. * @param {number} scrollY * Current distance the window has been scrolled. * @returns {object} * Contains adsVisible which is the number of ads shown for the element * and adsHidden, the number of ads not visible to the user. */ #countVisibleAndHiddenAds( element, adsLoaded, childElements, innerWindowHeight, scrollY ) { let elementRect = element.ownerGlobal.windowUtils.getBoundsWithoutFlushing(element); // If the element lacks a dimension, assume all ads that // were contained within it are hidden. if (elementRect.width == 0 || elementRect.height == 0) { return { adsVisible: 0, adsHidden: adsLoaded, }; } // If an ad is far above the possible visible area of a window, an // adblocker might be doing it as a workaround for blocking the ad. if ( elementRect.bottom < 0 && innerWindowHeight + scrollY + elementRect.bottom < 0 ) { return { adsVisible: 0, adsHidden: adsLoaded, }; } // Since the parent element has dimensions but no child elements we want // to inspect, check the parent itself is within the viewable area. if (!childElements || !childElements.length) { if (innerWindowHeight < elementRect.y + elementRect.height) { return { adsVisible: 0, adsHidden: 0, }; } return { adsVisible: 1, adsHidden: 0, }; } let adsVisible = 0; let adsHidden = 0; for (let child of childElements) { let itemRect = child.ownerGlobal.windowUtils.getBoundsWithoutFlushing(child); // If the child element we're inspecting has no dimension, it is hidden. if (itemRect.height == 0 || itemRect.width == 0) { adsHidden += 1; continue; } // If the child element is to the left of the containing element, or to // the right of the containing element, skip it. if ( itemRect.x < elementRect.x || itemRect.x + itemRect.width > elementRect.x + elementRect.width ) { continue; } // If the child element is too far down, skip it. if (innerWindowHeight < itemRect.y + itemRect.height) { continue; } ++adsVisible; } return { adsVisible, adsHidden, }; } /** * Caches ad data for a DOM element. The key of the map is by Element rather * than Component for fast lookup on whether an Element has been already been * categorized as a component. Subsequent calls to this passing the same * element will update the list of child elements. * * @param {Element} element * The element considered to be the root for the component. * @param {object} params * Various parameters that can be recorded. Whether the input values exist * or not depends on which component was found, which heuristic should be used * to determine whether an ad was visible, and whether we've already seen this * element. * @param {string | null} params.type * The type of component. * @param {number} params.count * The number of ads found for a component. The number represents either * the number of elements that match an ad expression or the number of DOM * elements containing an ad link. * @param {Array} params.proxyChildElements * An array of DOM elements that should be inspected for visibility instead * of the actual child elements, possibly because they are grouped. * @param {Array} params.childElements * An array of DOM elements to inspect. */ #recordElementData( element, { type, count = 1, proxyChildElements = [], childElements = [] } = {} ) { if (this.#elementToAdDataMap.has(element)) { let recordedValues = this.#elementToAdDataMap.get(element); if (childElements.length) { recordedValues.childElements = recordedValues.childElements.concat(childElements); } } else { this.#elementToAdDataMap.set(element, { type, adsLoaded: count, proxyChildElements, childElements, }); } } } /** * An object indicating which elements to examine for domains to extract and * which heuristic technique to use to extract that element's domain. * * @typedef {object} ExtractorInfo * @property {string} selectors * A string representing the CSS selector that targets the elements on the * page that contain domains we want to extract. * @property {string} method * A string representing which domain extraction heuristic to use. * One of: "href", "dataAttribute" or "textContent". * @property {object | null} options * Options related to the domain extraction heuristic used. * @property {string | null} options.dataAttributeKey * The key name of the data attribute to lookup. * @property {string | null} options.queryParamKey * The key name of the query param value to lookup. * @property {boolean | null} options.queryParamValueIsHref * Whether the query param value is expected to contain an href. */ /** * DomainExtractor examines elements on a page to retrieve the domains. */ class DomainExtractor { /** * Extract domains from the page using an array of information pertaining to * the SERP. * * @param {Document} document * The document for the SERP we are extracting domains from. * @param {Array} extractorInfos * Information used to target the domains we need to extract. * @param {string} providerName * Name of the search provider. * @return {Set} * A set of the domains extracted from the page. */ extractDomainsFromDocument(document, extractorInfos, providerName) { let extractedDomains = new Set(); if (!extractorInfos?.length) { return extractedDomains; } for (let extractorInfo of extractorInfos) { if (!extractorInfo.selectors) { continue; } let elements = document.querySelectorAll(extractorInfo.selectors); if (!elements) { continue; } switch (extractorInfo.method) { case "href": { // Origin is used in case a URL needs to be made absolute. let origin = new URL(document.documentURI).origin; this.#fromElementsConvertHrefsIntoDomains( elements, origin, providerName, extractedDomains, extractorInfo.options?.queryParamKey, extractorInfo.options?.queryParamValueIsHref ); break; } case "dataAttribute": { this.#fromElementsRetrieveDataAttributeValues( elements, providerName, extractorInfo.options?.dataAttributeKey, extractedDomains ); break; } case "textContent": { this.#fromElementsRetrieveTextContent(elements, extractedDomains); break; } } } return extractedDomains; } /** * Given a list of elements, extract domains using href attributes. If the * URL in the href includes the specified query param, the domain will be * that query param's value. Otherwise it will be the hostname of the href * attribute's URL. * * @param {NodeList} elements * A list of elements from the page whose href attributes we want to * inspect. * @param {string} origin * Origin of the current page. * @param {string} providerName * The name of the search provider. * @param {Set} extractedDomains * The result set of domains extracted from the page. * @param {string | null} queryParam * An optional query param to search for in an element's href attribute. * @param {boolean | null} queryParamValueIsHref * Whether the query param value is expected to contain an href. */ #fromElementsConvertHrefsIntoDomains( elements, origin, providerName, extractedDomains, queryParam, queryParamValueIsHref ) { for (let element of elements) { if (this.#exceedsThreshold(extractedDomains.size)) { return; } let href = element.getAttribute("href"); let url; try { url = new URL(href, origin); } catch (ex) { continue; } // Ignore non-standard protocols. if (url.protocol != "https:" && url.protocol != "http:") { continue; } if (queryParam) { let paramValue = url.searchParams.get(queryParam); if (queryParamValueIsHref) { try { paramValue = new URL(paramValue).hostname; } catch (e) { continue; } paramValue = this.#processDomain(paramValue, providerName); } if (paramValue && !extractedDomains.has(paramValue)) { extractedDomains.add(paramValue); } } else if (url.hostname) { let processedHostname = this.#processDomain(url.hostname, providerName); if (processedHostname && !extractedDomains.has(processedHostname)) { extractedDomains.add(processedHostname); } } } } /** * Given a list of elements, examine each for the specified data attribute. * If found, add that data attribute's value to the result set of extracted * domains as is. * * @param {NodeList} elements * A list of elements from the page whose data attributes we want to * inspect. * @param {string} providerName * The name of the search provider. * @param {string} attribute * The name of a data attribute to search for within an element. * @param {Set} extractedDomains * The result set of domains extracted from the page. */ #fromElementsRetrieveDataAttributeValues( elements, providerName, attribute, extractedDomains ) { for (let element of elements) { if (this.#exceedsThreshold(extractedDomains.size)) { return; } let value = element.dataset[attribute]; value = this.#processDomain(value, providerName); if (value && !extractedDomains.has(value)) { extractedDomains.add(value); } } } /* Given a list of elements, examine the text content for each element, which * may be 1) a URL from which we can extract a domain or 2) text we can fix * up to create a best guess as to a URL. If either condition is met, we add * the domain to the result set. * * @param {NodeList} elements * A list of elements from the page whose text content we want to inspect. * @param {Set} extractedDomains * The result set of domains extracted from the page. */ #fromElementsRetrieveTextContent(elements, extractedDomains) { for (let element of elements) { if (this.#exceedsThreshold(extractedDomains.size)) { return; } let textContent = element.textContent; if (!textContent) { continue; } let domain; try { domain = new URL(textContent).hostname; } catch (e) { domain = textContent.toLowerCase().replaceAll(" ", ""); // If the attempt to turn the text content into a URL object only fails // because we're missing a protocol, ".com" may already be present. if (!domain.endsWith(".com")) { domain = domain.concat(".com"); } } if (!extractedDomains.has(domain)) { extractedDomains.add(domain); } } } /** * Processes a raw domain extracted from the SERP into its final form before * categorization. * * @param {string} domain * The domain extracted from the page. * @param {string} providerName * The provider associated with the page. * @returns {string} * The domain without any subdomains. */ #processDomain(domain, providerName) { if ( domain.startsWith(`${providerName}.`) || domain.includes(`.${providerName}.`) ) { return ""; } return this.#stripDomainOfSubdomains(domain); } /** * Helper to strip domains of any subdomains. * * @param {string} domain * The domain to strip of any subdomains. * @returns {object} browser * The given domain with any subdomains removed. */ #stripDomainOfSubdomains(domain) { let tld; // Can throw an exception if the input has too few domain levels. try { tld = Services.eTLD.getKnownPublicSuffixFromHost(domain); } catch (ex) { return ""; } let domainWithoutTLD = domain.substring(0, domain.length - tld.length); let secondLevelDomain = domainWithoutTLD.split(".").at(-2); return secondLevelDomain ? `${secondLevelDomain}.${tld}` : ""; } /** * Per a request from Data Science, we need to limit the number of domains * categorized to 10 non-ad domains and 10 ad domains. * * @param {number} nDomains The number of domains processed. * @returns {boolean} Whether or not the threshold was exceeded. */ #exceedsThreshold(nDomains) { return nDomains >= CATEGORIZATION_SETTINGS.MAX_DOMAINS_TO_CATEGORIZE; } } export const domainExtractor = new DomainExtractor(); const searchProviders = new SearchProviders(); const searchAdImpression = new SearchAdImpression(); const documentToEventCallbackMap = new WeakMap(); const documentToRemoveEventListenersMap = new WeakMap(); const documentToSubmitMap = new WeakMap(); /** * SearchTelemetryChild monitors for pages that are partner searches, and * looks through them to find links which looks like adverts and sends back * a notification to SearchTelemetry for possible telemetry reporting. * * Only the partner details and the fact that at least one ad was found on the * page are returned to SearchTelemetry. If no ads are found, no notification is * given. */ export class SearchSERPTelemetryChild extends JSWindowActorChild { /** * Amount of time to wait after a page event before examining the page * for ads. * * @type {number | null} */ #adTimeout; /** * Determines if there is a provider that matches the supplied URL and returns * the information associated with that provider. * * @param {string} url The url to check * @returns {array|null} Returns null if there's no match, otherwise an array * of provider name and the provider information. */ _getProviderInfoForUrl(url) { return searchProviders.info?.find(info => info.searchPageRegexp.test(url)); } /** * Checks to see if the page is a partner and has an ad link within it. If so, * it will notify SearchTelemetry. */ _checkForAdLink(eventType) { try { if (!this.contentWindow) { return; } } catch (ex) { // unload occurred before the timer expired return; } let doc = this.document; let url = doc.documentURI; let providerInfo = this._getProviderInfoForUrl(url); if (!providerInfo) { return; } let regexps = providerInfo.extraAdServersRegexps; let anchors = doc.getElementsByTagName("a"); let hasAds = false; for (let anchor of anchors) { if (!anchor.href) { continue; } for (let name of providerInfo.adServerAttributes) { hasAds = regexps.some(regexp => regexp.test(anchor.dataset[name])); if (hasAds) { break; } } if (!hasAds) { hasAds = regexps.some(regexp => regexp.test(anchor.href)); } if (hasAds) { break; } } if (hasAds) { this.sendAsyncMessage("SearchTelemetry:PageInfo", { hasAds, url, }); } if ( lazy.serpEventsEnabled && providerInfo.components?.length && (eventType == "load" || eventType == "pageshow") ) { // Start performance measurements. let start = Cu.now(); let timerId = Glean.serp.categorizationDuration.start(); let pageActionCallback = info => { if (info.action == "submitted") { documentToSubmitMap.set(doc, true); } this.sendAsyncMessage("SearchTelemetry:Action", { target: info.target, url: info.url, action: info.action, }); }; documentToEventCallbackMap.set(this.document, pageActionCallback); let componentToVisibilityMap, hrefToComponentMap; try { let result = searchAdImpression.categorize(anchors, doc); componentToVisibilityMap = result.componentToVisibilityMap; hrefToComponentMap = result.hrefToComponentMap; } catch (e) { // Cancel the timer if an error encountered. Glean.serp.categorizationDuration.cancel(timerId); } if (componentToVisibilityMap && hrefToComponentMap) { // End measurements. ChromeUtils.addProfilerMarker( "SearchSERPTelemetryChild._checkForAdLink", start, "Checked anchors for visibility" ); Glean.serp.categorizationDuration.stopAndAccumulate(timerId); this.sendAsyncMessage("SearchTelemetry:AdImpressions", { adImpressions: componentToVisibilityMap, hrefToComponentMap, url, }); } } if ( lazy.serpEventTelemetryCategorization && providerInfo.domainExtraction && (eventType == "load" || eventType == "pageshow") ) { let start = Cu.now(); let nonAdDomains = domainExtractor.extractDomainsFromDocument( doc, providerInfo.domainExtraction.nonAds, providerInfo.telemetryId ); let adDomains = domainExtractor.extractDomainsFromDocument( doc, providerInfo.domainExtraction.ads, providerInfo.telemetryId ); this.sendAsyncMessage("SearchTelemetry:Domains", { url, nonAdDomains, adDomains, }); ChromeUtils.addProfilerMarker( "SearchSERPTelemetryChild._checkForAdLink", start, "Extract domains from elements" ); } } /** * Checks for the presence of certain components on the page that are * required for recording the page impression. */ #checkForPageImpressionComponents() { let url = this.document.documentURI; let providerInfo = this._getProviderInfoForUrl(url); if (providerInfo.components?.length) { searchAdImpression.providerInfo = providerInfo; let start = Cu.now(); let shoppingTabDisplayed = searchAdImpression.hasShoppingTab( this.document ); ChromeUtils.addProfilerMarker( "SearchSERPTelemetryChild.#recordImpression", start, "Checked for shopping tab" ); this.sendAsyncMessage("SearchTelemetry:PageImpression", { url, shoppingTabDisplayed, }); } } #removeEventListeners() { let callbacks = documentToRemoveEventListenersMap.get(this.document); if (callbacks) { for (let callback of callbacks) { callback(); } documentToRemoveEventListenersMap.delete(this.document); } } /** * Handles events received from the actor child notifications. * * @param {object} event The event details. */ handleEvent(event) { if (!this.#urlIsSERP(this.document.documentURI)) { return; } switch (event.type) { case "pageshow": { // If a page is loaded from the bfcache, we won't get a "DOMContentLoaded" // event, so we need to rely on "pageshow" in this case. Note: we do this // so that we remain consistent with the *.in-content:sap* count for the // SEARCH_COUNTS histogram. if (event.persisted) { this.#check(event.type); if (lazy.serpEventsEnabled) { this.#checkForPageImpressionComponents(); } } break; } case "DOMContentLoaded": { if (lazy.serpEventsEnabled) { this.#checkForPageImpressionComponents(); } this.#check(event.type); break; } case "load": { // We check both DOMContentLoaded and load in case the page has // taken a long time to load and the ad is only detected on load. // We still check at DOMContentLoaded because if the page hasn't // finished loading and the user navigates away, we still want to know // if there were ads on the page or not at that time. this.#check(event.type); break; } case "pagehide": { let callbacks = documentToRemoveEventListenersMap.get(this.document); if (callbacks) { for (let removeEventListenerCallback of callbacks) { removeEventListenerCallback(); } documentToRemoveEventListenersMap.delete(this.document); } this.#cancelCheck(); break; } } } async receiveMessage(message) { switch (message.name) { case "SearchSERPTelemetry:WaitForSPAPageLoad": lazy.setTimeout(() => { this.#checkForPageImpressionComponents(); this._checkForAdLink("load"); }, Services.cpmm.sharedData.get(SEARCH_TELEMETRY_SHARED.SPA_LOAD_TIMEOUT)); break; case "SearchSERPTelemetry:StopTrackingDocument": this.#removeDocumentFromSubmitMap(); this.#removeEventListeners(); break; case "SearchSERPTelemetry:DidSubmit": return this.#didSubmit(); } return null; } #didSubmit() { return documentToSubmitMap.get(this.document); } #removeDocumentFromSubmitMap() { documentToSubmitMap.delete(this.document); } #urlIsSERP(url) { let provider = this._getProviderInfoForUrl(this.document.documentURI); if (provider) { // Some URLs can match provider info but also be the provider's homepage // instead of a SERP. // e.g. https://example.com/ vs. https://example.com/?foo=bar // To check this, we look for the presence of the query parameter // that contains a search term. let queries = new URLSearchParams(url.split("#")[0].split("?")[1]); for (let queryParamName of provider.queryParamNames) { if (queries.get(queryParamName)) { return true; } } } return false; } #cancelCheck() { if (this._waitForContentTimeout) { lazy.clearTimeout(this._waitForContentTimeout); } } #check(eventType) { if (!this.#adTimeout) { this.#adTimeout = Services.cpmm.sharedData.get( SEARCH_TELEMETRY_SHARED.LOAD_TIMEOUT ); } this.#cancelCheck(); this._waitForContentTimeout = lazy.setTimeout(() => { this._checkForAdLink(eventType); }, this.#adTimeout); } }