summaryrefslogtreecommitdiffstats
path: root/browser/components/newtab/lib/SiteClassifier.jsm
blob: 0b174676b95f47c360a93886783dd6e3bf30c35f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
"use strict";

const { RemoteSettings } = ChromeUtils.importESModule(
  "resource://services-settings/remote-settings.sys.mjs"
);

// Checks the query parameters in `params` against a list of criteria.
// Every entry in `criteria` names a `key` that must be present in `params`.
// An entry may additionally carry a `value` (the parameter must equal it)
// and/or a `prefix` (the parameter must start with it). Returns true only
// when every entry is satisfied; an empty criteria list is always satisfied.
function _hasParams(criteria, params) {
  for (const { key, value, prefix } of criteria) {
    const actual = params.get(key);

    // The parameter has to exist at all.
    if (actual === null) {
      return false;
    }
    // When an exact value is requested, it has to be identical.
    if (value && actual !== value) {
      return false;
    }
    // When a prefix is requested, the parameter has to begin with it.
    if (prefix && !actual.startsWith(prefix)) {
      return false;
    }
  }
  return true;
}

/**
 * classifySite
 * Classifies a given URL into a category based on classification data from RemoteSettings.
 * The data from remote settings can match a category by one of the following:
 *  - match the exact URL
 *  - match the hostname or second level domain (sld)
 *  - match query parameter(s), and optionally their values or prefixes
 *  - match both (hostname or sld) and query parameter(s)
 *
 * The data looks like:
 * [{
 *    "type": "hostname-and-params-match",
 *    "criteria": [
 *      {
 *        "url": "https://matchurl.com",
 *        "hostname": "matchhostname.com",
 *        "sld": "secondleveldomain",
 *        "params": [
 *          {
 *            "key": "matchparam",
 *            "value": "matchvalue",
 *            "prefix": "matchPrefix",
 *          },
 *        ],
 *      },
 *    ],
 *    "weight": 300,
 *  },...]
 *
 * Records are tried from the highest "weight" to the lowest (a missing weight
 * counts as 0) and the "type" of the first record with a matching criteria
 * entry is returned. Within one criteria entry, every field that is present
 * must match:
 *  - "url" is compared against the `url` argument exactly as passed in
 *  - "hostname" is compared against the parsed hostname with a leading "www."
 *    removed
 *  - "sld" is compared against the first dot-separated label of that hostname
 *  - "params" is checked with _hasParams against the parsed query string
 *
 * @param {string} url
 *   The URL to classify. If it cannot be parsed as given, it is retried with
 *   "https://" prepended.
 * @param {Function} [RS=RemoteSettings]
 *   Called with the collection name "sites-classification"; must return an
 *   object whose `get()` resolves to the list of records described above.
 * @returns {Promise<string>}
 *   The "type" of the first matching record, or "other" when the URL cannot
 *   be parsed or nothing matches.
 */
async function classifySite(url, RS = RemoteSettings) {
  let parsedURL;

  // Try the URL as given first, then with an explicit https:// scheme.
  for (const candidate of [url, `https://${url}`]) {
    try {
      parsedURL = new URL(candidate);
      break;
    } catch (e) {
      // Deliberately ignored: this form is not parseable, try the next one.
    }
  }

  if (!parsedURL) {
    return "other";
  }

  const hostname = parsedURL.hostname.replace(/^www\./i, "");
  // Loop-invariant, so compute it once rather than per criteria entry.
  const sld = hostname.split(".")[0];
  const params = parsedURL.searchParams;

  // NOTE: there will be an initial/default local copy of the data in m-c.
  // Therefore, this should never return an empty list [].
  const siteTypes = await RS("sites-classification").get();

  // Sort a copy: Array.prototype.sort works in place, and the list handed
  // back by the RemoteSettings client is not ours to reorder.
  const sortedSiteTypes = [...siteTypes].sort(
    (x, y) => (y.weight || 0) - (x.weight || 0)
  );

  for (const siteType of sortedSiteTypes) {
    // A record without a criteria list can never match; skip it instead of
    // throwing and losing the classification for every other record.
    for (const criteria of siteType.criteria || []) {
      if (criteria.url && criteria.url !== url) {
        continue;
      }
      if (criteria.hostname && criteria.hostname !== hostname) {
        continue;
      }
      if (criteria.sld && criteria.sld !== sld) {
        continue;
      }
      if (criteria.params && !_hasParams(criteria.params, params)) {
        continue;
      }
      return siteType.type;
    }
  }

  return "other";
}

// Names this JSM exposes to importers; only classifySite is public,
// _hasParams stays module-private.
const EXPORTED_SYMBOLS = ["classifySite"];