/**
 * Returns whether the passed-in query parameters satisfy every entry in
 * `criteria`.
 *
 * An entry is satisfied when the parameter named by `key` is present and,
 * where the entry supplies them, its value equals `value` and starts with
 * `prefix`. A falsy `value` or `prefix` (absent or empty string) places no
 * constraint on the parameter's value.
 *
 * @param {Iterable<{key: string, value?: string, prefix?: string}>} criteria
 *   Parameter requirements; an empty list always matches.
 * @param {URLSearchParams} params
 *   Query parameters to test; only `get(key)` is called on it.
 * @returns {boolean} true when every entry in `criteria` is satisfied.
 */
function _hasParams(criteria, params) {
  for (const param of criteria) {
    const val = params.get(param.key);
    // `get` yields null for an absent key; a required key must be present.
    if (val === null) {
      return false;
    }
    // An exact value, when given, must match the whole parameter value.
    if (param.value && param.value !== val) {
      return false;
    }
    // A prefix, when given, must match the start of the parameter value.
    if (param.prefix && !val.startsWith(param.prefix)) {
      return false;
    }
  }
  return true;
}
/**
 * classifySite
 * Classifies a given URL into a category based on classification data from RemoteSettings.
 * The data from remote settings can match a category by one of the following:
 *  - match the exact URL
 *  - match the hostname or second level domain (sld)
 *  - match query parameter(s), and optionally their values or prefixes
 *  - match both (hostname or sld) and query parameter(s)
 *
 * The data looks like:
 * [{
 *    "type": "hostname-and-params-match",
 *    "criteria": [
 *      {
 *        "url": "https://matchurl.com",
 *        "hostname": "matchhostname.com",
 *        "sld": "secondleveldomain",
 *        "params": [
 *          {
 *            "key": "matchparam",
 *            "value": "matchvalue",
 *            "prefix": "matchPrefix",
 *          },
 *        ],
 *      },
 *    ],
 *    "weight": 300,
 *  },...]
 *
 * Entries are tried from highest to lowest `weight` (a missing weight counts
 * as 0) and the first entry with a satisfied criteria object wins. Within a
 * criteria object every field that is present must match:
 *  - `url` is compared against the `url` argument exactly as passed in
 *  - `hostname` is compared against the parsed hostname with a leading
 *    "www." removed
 *  - `sld` is compared against the first dot-separated label of that hostname
 *  - `params` is checked with `_hasParams`
 *
 * @param {string} url
 *   The URL to classify. If it does not parse as given, it is retried with
 *   "https://" prepended.
 * @param {Function} [RS=RemoteSettings]
 *   Called as `RS("sites-classification")`; the result's async `get()` must
 *   resolve to the list described above.
 * @returns {Promise<string>}
 *   The `type` of the first matching entry, or "other" when the URL cannot
 *   be parsed or nothing matches.
 */
async function classifySite(url, RS = RemoteSettings) {
  let parsedURL;

  // Try the input as given, then again with a scheme so bare hosts parse.
  for (const candidate of [url, `https://${url}`]) {
    try {
      parsedURL = new URL(candidate);
      break;
    } catch (e) {
      // Not a valid URL in this form; fall through to the next candidate.
    }
  }

  if (!parsedURL) {
    return "other";
  }

  const hostname = parsedURL.hostname.replace(/^www\./i, "");
  const params = parsedURL.searchParams;
  // NOTE: there will be an initial/default local copy of the data in m-c.
  // Therefore, this should never return an empty list [].
  const siteTypes = await RS("sites-classification").get();
  // Sort a copy: Array.prototype.sort works in place, and the list belongs
  // to the settings client, not to this function.
  const sortedSiteTypes = [...siteTypes].sort(
    (x, y) => (y.weight || 0) - (x.weight || 0)
  );

  for (const type of sortedSiteTypes) {
    // An entry without a criteria list can never match; skip it instead of
    // throwing and aborting the whole classification.
    for (const criteria of type.criteria || []) {
      if (criteria.url && criteria.url !== url) {
        continue;
      }
      if (criteria.hostname && criteria.hostname !== hostname) {
        continue;
      }
      if (criteria.sld && criteria.sld !== hostname.split(".")[0]) {
        continue;
      }
      if (criteria.params && !_hasParams(criteria.params, params)) {
        continue;
      }
      return type.type;
    }
  }

  return "other";
}

const EXPORTED_SYMBOLS = ["classifySite"];