summaryrefslogtreecommitdiffstats
path: root/browser/components/doh/TRRPerformance.sys.mjs
diff options
context:
space:
mode:
Diffstat (limited to 'browser/components/doh/TRRPerformance.sys.mjs')
-rw-r--r--browser/components/doh/TRRPerformance.sys.mjs395
1 files changed, 395 insertions, 0 deletions
diff --git a/browser/components/doh/TRRPerformance.sys.mjs b/browser/components/doh/TRRPerformance.sys.mjs
new file mode 100644
index 0000000000..e46f280f40
--- /dev/null
+++ b/browser/components/doh/TRRPerformance.sys.mjs
@@ -0,0 +1,395 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/*
+ * This module tests TRR performance by issuing DNS requests to TRRs and
+ * recording telemetry for the network time for each request.
+ *
+ * We test each TRR with 5 random subdomains of a canonical domain and also
+ * a "popular" domain (which the TRRs likely have cached).
+ *
+ * To ensure data integrity, we run the requests in an aggregator wrapper
+ * and collect all the results before sending telemetry. If we detect network
+ * loss, the results are discarded. A new run is triggered upon detection of
+ * usable network until a full set of results has been captured. We stop retrying
+ * after 5 attempts.
+ */
// Switch on event recording for the category that recordResults() reports
// into; this runs once, when the module is first loaded.
Services.telemetry.setEventRecordingEnabled("security.doh.trrPerformance", true);
+
+import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
+
// Container for the lazily-instantiated services and preference-backed
// values used throughout this module.
const lazy = {};

// Service used to check whether a network link is currently up.
XPCOMUtils.defineLazyServiceGetter(
  lazy,
  "gNetworkLinkService",
  "@mozilla.org/network/network-link-service;1",
  "nsINetworkLinkService"
);

// Service used to check the current captive-portal state.
XPCOMUtils.defineLazyServiceGetter(
  lazy,
  "gCaptivePortalService",
  "@mozilla.org/network/captive-portal-service;1",
  "nsICaptivePortalService"
);

// Canonical domain; random subdomains of it are what we resolve.
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "kCanonicalDomain",
  "doh-rollout.trrRace.canonicalDomain",
  "firefox-dns-perf-test.net."
);

// How many random subdomains are resolved against each TRR.
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "kRepeats",
  "doh-rollout.trrRace.randomSubdomainCount",
  5
);

// "Popular" domains that we expect the TRRs to have cached. The pref holds a
// comma-separated list; when it is empty or unset, a built-in list is used.
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "kPopularDomains",
  "doh-rollout.trrRace.popularDomains",
  null,
  null,
  prefValue => {
    if (!prefValue) {
      return [
        "google.com.",
        "youtube.com.",
        "amazon.com.",
        "facebook.com.",
        "yahoo.com.",
      ];
    }
    return prefValue.split(",").map(entry => entry.trim());
  }
);
+
/**
 * Builds a hostname made of a freshly generated UUID label followed by the
 * configured canonical domain.
 *
 * @returns {string} `<uuid>.<canonical domain>`
 */
function getRandomSubdomain() {
  const bracedUuid = Services.uuid.generateUUID().toString();
  // The stringified UUID is wrapped in braces; strip the first and last char.
  const label = bracedUuid.slice(1, -1);
  return `${label}.${lazy.kCanonicalDomain}`;
}
+
// A wrapper around async DNS lookups. The results are passed on to the supplied
// callback. The wrapper attempts the lookup 3 times before passing on a failure.
// If a false-y `domain` is supplied, a random subdomain will be used. Each retry
// will use a different random subdomain to ensure we bypass cached responses.
export class DNSLookup {
  /**
   * @param {?string} domain
   *   Domain to resolve. When false-y, a new random subdomain is generated
   *   for every attempt.
   * @param {string} trrServer
   *   Passed to Services.dns.newAdditionalInfo() for each attempt.
   * @param {Function} callback
   *   Invoked exactly once as
   *   callback(request, record, status, usedDomain, retryCount).
   */
  constructor(domain, trrServer, callback) {
    this._domain = domain;
    this.trrServer = trrServer;
    this.callback = callback;
    this.retryCount = 0;
  }

  /**
   * Starts one resolution attempt and bumps the attempt counter. The result
   * is delivered to onLookupComplete().
   */
  doLookup() {
    this.retryCount++;
    try {
      this.usedDomain = this._domain || getRandomSubdomain();
      Services.dns.asyncResolve(
        this.usedDomain,
        Ci.nsIDNSService.RESOLVE_TYPE_DEFAULT,
        Ci.nsIDNSService.RESOLVE_BYPASS_CACHE,
        Services.dns.newAdditionalInfo(this.trrServer, -1),
        this,
        Services.tm.currentThread,
        {}
      );
    } catch (e) {
      console.error(e);

      // The request was never started, so no listener notification will
      // arrive for it. Feed a failure into the normal completion path so the
      // attempt is retried and, eventually, the callback fires; otherwise
      // whoever is counting completed lookups would wait forever.
      const status =
        e && typeof e.result == "number" && !Components.isSuccessCode(e.result)
          ? e.result
          : Components.results.NS_ERROR_FAILURE;

      // Deliver from a microtask so completion is never observed
      // synchronously from within doLookup(), matching the successful
      // (asynchronous) path.
      Promise.resolve().then(() => this.onLookupComplete(null, null, status));
    }
  }

  /**
   * Listener entry point for a finished attempt (also used for attempts
   * that failed to start).
   *
   * @param request  The request object for the attempt, or null.
   * @param record   The resulting DNS record, or null when there is none.
   * @param status   Result code of the attempt.
   */
  onLookupComplete(request, record, status) {
    // Try again if we failed...
    if (!Components.isSuccessCode(status) && this.retryCount < 3) {
      this.doLookup();
      return;
    }

    // But after the third try, just pass the status on.
    this.callback(request, record, status, this.usedDomain, this.retryCount);
  }
}
+
// Give every DNSLookup a QueryInterface that answers for nsIDNSListener, so
// instances can be handed to asyncResolve() as the listener.
DNSLookup.prototype.QueryInterface = ChromeUtils.generateQI([
  "nsIDNSListener",
]);
+
// A wrapper around a single set of measurements. The required lookups are
// triggered and the results aggregated before telemetry is sent. If aborted,
// any aggregated results are discarded.
export class LookupAggregator {
  /**
   * @param {Function} onCompleteCallback
   *   Called with no arguments once all results have been recorded.
   * @param {Array} trrList
   *   The TRRs to measure; every domain is looked up against every entry.
   */
  constructor(onCompleteCallback, trrList) {
    this.onCompleteCallback = onCompleteCallback;
    this.trrList = trrList;
    this.aborted = false;
    this.networkUnstable = false;
    this.captivePortal = false;
    this._ran = false;

    this.domains = [];
    for (let i = 0; i < lazy.kRepeats; ++i) {
      // false-y domain will cause DNSLookup to generate a random one.
      this.domains.push(null);
    }
    this.domains.push(...lazy.kPopularDomains);
    this.totalLookups = this.trrList.length * this.domains.length;
    this.completedLookups = 0;
    this.results = [];
  }

  /**
   * Starts one DNSLookup per (TRR, domain) pair. An aggregator can be run
   * only once, and not at all after abort() has been called.
   */
  run() {
    // Note: the flag set by abort() is `aborted` (no underscore).
    if (this._ran || this.aborted) {
      console.error("Trying to re-run a LookupAggregator.");
      return;
    }

    this._ran = true;
    for (let trr of this.trrList) {
      for (let domain of this.domains) {
        new DNSLookup(
          domain,
          trr,
          (request, record, status, usedDomain, retryCount) => {
            this.results.push({
              domain: usedDomain,
              trr,
              status,
              // -1 marks a lookup that produced no record.
              time: record
                ? record.QueryInterface(Ci.nsIDNSAddrRecord)
                    .trrFetchDurationNetworkOnly
                : -1,
              retryCount,
            });

            this.completedLookups++;
            if (this.completedLookups == this.totalLookups) {
              this.recordResults();
            }
          }
        ).doLookup();
      }
    }
  }

  /** Marks this run as aborted; recordResults() will then record nothing. */
  abort() {
    this.aborted = true;
  }

  /** Flags the results as having been gathered on an unstable network. */
  markUnstableNetwork() {
    this.networkUnstable = true;
  }

  /** Flags the results as having been gathered behind a captive portal. */
  markCaptivePortal() {
    this.captivePortal = true;
  }

  /**
   * Records one telemetry event per collected result and then invokes
   * onCompleteCallback. Does nothing if the run was aborted.
   *
   * If a result carries a domain that is neither one of the popular domains
   * nor contains the canonical domain, an error is logged and the method
   * returns right away: events for earlier results have already been
   * recorded, and onCompleteCallback is not invoked.
   */
  recordResults() {
    if (this.aborted) {
      return;
    }

    for (let { domain, trr, status, time, retryCount } of this.results) {
      if (
        !(
          lazy.kPopularDomains.includes(domain) ||
          domain.includes(lazy.kCanonicalDomain)
        )
      ) {
        console.error("Expected known domain for reporting, got ", domain);
        return;
      }

      // Every extra value is stringified before being recorded.
      Services.telemetry.recordEvent(
        "security.doh.trrPerformance",
        "resolved",
        "record",
        "success",
        {
          domain,
          trr,
          status: status.toString(),
          time: time.toString(),
          retryCount: retryCount.toString(),
          networkUnstable: this.networkUnstable.toString(),
          captivePortal: this.captivePortal.toString(),
        }
      );
    }

    this.onCompleteCallback();
  }
}
+
// This class monitors the network and spawns a new LookupAggregator when ready.
// When the network goes down, an ongoing aggregator is aborted and a new one
// spawned next time we get a link, up to 5 times. On the fifth time, we just
// let the aggregator complete and mark it as tainted.
export class TRRRacer {
  /**
   * @param {?Function} onCompleteCallback
   *   Optional; called with no arguments when a measurement run completes.
   * @param {Array} trrList
   *   The TRRs to race; handed to each LookupAggregator.
   */
  constructor(onCompleteCallback, trrList) {
    this._aggregator = null;
    this._retryCount = 0;
    this._complete = false;
    this._onCompleteCallback = onCompleteCallback;
    this._trrList = trrList;
  }

  /**
   * Starts a measurement right away if the link is up and we are not behind
   * a locked captive portal, then begins observing link and captive-portal
   * notifications.
   */
  run() {
    if (
      lazy.gNetworkLinkService.isLinkUp &&
      lazy.gCaptivePortalService.state !=
        lazy.gCaptivePortalService.LOCKED_PORTAL
    ) {
      this._runNewAggregator();
      if (
        lazy.gCaptivePortalService.state ==
        lazy.gCaptivePortalService.UNLOCKED_PORTAL
      ) {
        this._aggregator.markCaptivePortal();
      }
    }

    Services.obs.addObserver(this, "ipc:network:captive-portal-set-state");
    Services.obs.addObserver(this, "network:link-status-changed");
  }

  /**
   * Completion handler passed to the aggregator: stops observing, marks the
   * race complete and notifies the owner, if a callback was supplied.
   */
  onComplete() {
    Services.obs.removeObserver(this, "ipc:network:captive-portal-set-state");
    Services.obs.removeObserver(this, "network:link-status-changed");

    this._complete = true;

    if (this._onCompleteCallback) {
      this._onCompleteCallback();
    }
  }

  /**
   * @param {boolean} [returnRandomDefault=false]
   *   See _getFastestTRRFromResults().
   * @returns The fastest TRR of the completed run.
   * @throws {Error} If the measurement has not completed yet.
   */
  getFastestTRR(returnRandomDefault = false) {
    if (!this._complete) {
      throw new Error("getFastestTRR: Measurement still running.");
    }

    return this._getFastestTRRFromResults(
      this._aggregator.results,
      returnRandomDefault
    );
  }

  /*
   * Given an array of { trr, time }, returns the trr with smallest mean time.
   * Separate from getFastestTRR for easy unit-testing.
   *
   * A time of -1 marks a failed lookup. Any other value that is not a finite,
   * non-negative number is also counted as a failure, so that it cannot turn
   * a TRR's mean into NaN.
   *
   * @returns The TRR with the fastest average time.
   *          If returnRandomDefault is false-y, returns undefined if no valid
   *          times were present in the results. Otherwise, returns one of the
   *          present TRRs at random.
   */
  _getFastestTRRFromResults(results, returnRandomDefault = false) {
    // First, organize the results into a map of TRR -> array of times
    let TRRTimingMap = new Map();
    let TRRErrorCount = new Map();
    for (let { trr, time } of results) {
      if (!TRRTimingMap.has(trr)) {
        TRRTimingMap.set(trr, []);
      }
      if (Number.isFinite(time) && time >= 0) {
        TRRTimingMap.get(trr).push(time);
      } else {
        TRRErrorCount.set(trr, 1 + (TRRErrorCount.get(trr) || 0));
      }
    }

    // Loop through each TRR's array of times, compute the geometric means,
    // and remember the fastest TRR. Geometric mean is a bit more forgiving
    // in the presence of noise (anomalously high values).
    // We don't need the full geometric mean, we simply calculate the arithmetic
    // means in log-space and then compare those values.
    let fastestTRR;
    let fastestAverageTime = Infinity;
    let trrs = [...TRRTimingMap.keys()];
    for (let trr of trrs) {
      let times = TRRTimingMap.get(trr);
      if (!times.length) {
        continue;
      }

      // Skip TRRs that had an error rate of more than 30%.
      let errorCount = TRRErrorCount.get(trr) || 0;
      let totalResults = times.length + errorCount;
      if (errorCount / totalResults > 0.3) {
        continue;
      }

      // Arithmetic mean in log space. Take log of (a + 1) to ensure we never
      // take log(0) which would be -Infinity.
      let averageTime =
        times.reduce((sum, a) => sum + Math.log(a + 1), 0) / times.length;
      // Strict comparison: on a tie the TRR seen first wins.
      if (averageTime < fastestAverageTime) {
        fastestAverageTime = averageTime;
        fastestTRR = trr;
      }
    }

    if (returnRandomDefault && !fastestTRR) {
      fastestTRR = trrs[Math.floor(Math.random() * trrs.length)];
    }

    return fastestTRR;
  }

  // Creates and starts a fresh aggregator, and counts the attempt.
  _runNewAggregator() {
    this._aggregator = new LookupAggregator(
      () => this.onComplete(),
      this._trrList
    );
    this._aggregator.run();
    this._retryCount++;
  }

  // When the link goes *down*, or when we detect a locked captive portal, we
  // abort any ongoing LookupAggregator run. When the link goes *up*, or we
  // detect a newly unlocked portal, we start a run if one isn't ongoing.
  // Once 5 runs have been started we no longer abort; the current run is
  // only marked (unstable network / captive portal) and left to finish.
  observe(subject, topic, data) {
    switch (topic) {
      case "network:link-status-changed":
        if (this._aggregator && data == "down") {
          if (this._retryCount < 5) {
            this._aggregator.abort();
          } else {
            this._aggregator.markUnstableNetwork();
          }
        } else if (
          data == "up" &&
          (!this._aggregator || this._aggregator.aborted)
        ) {
          this._runNewAggregator();
        }
        break;
      case "ipc:network:captive-portal-set-state":
        if (
          this._aggregator &&
          lazy.gCaptivePortalService.state ==
            lazy.gCaptivePortalService.LOCKED_PORTAL
        ) {
          if (this._retryCount < 5) {
            this._aggregator.abort();
          } else {
            this._aggregator.markCaptivePortal();
          }
        } else if (
          lazy.gCaptivePortalService.state ==
            lazy.gCaptivePortalService.UNLOCKED_PORTAL &&
          (!this._aggregator || this._aggregator.aborted)
        ) {
          this._runNewAggregator();
        }
        break;
    }
  }
}