diff options
Diffstat (limited to 'browser/components/doh/TRRPerformance.jsm')
-rw-r--r-- | browser/components/doh/TRRPerformance.jsm | 404 |
1 files changed, 404 insertions, 0 deletions
diff --git a/browser/components/doh/TRRPerformance.jsm b/browser/components/doh/TRRPerformance.jsm new file mode 100644 index 0000000000..8530301fbc --- /dev/null +++ b/browser/components/doh/TRRPerformance.jsm @@ -0,0 +1,404 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +"use strict"; + +/* + * This module tests TRR performance by issuing DNS requests to TRRs and + * recording telemetry for the network time for each request. + * + * We test each TRR with 5 random subdomains of a canonical domain and also + * a "popular" domain (which the TRR likely have cached). + * + * To ensure data integrity, we run the requests in an aggregator wrapper + * and collect all the results before sending telemetry. If we detect network + * loss, the results are discarded. A new run is triggered upon detection of + * usable network until a full set of results has been captured. We stop retrying + * after 5 attempts. + */ +var EXPORTED_SYMBOLS = ["TRRRacer", "DNSLookup", "LookupAggregator"]; + +Services.telemetry.setEventRecordingEnabled( + "security.doh.trrPerformance", + true +); + +const { XPCOMUtils } = ChromeUtils.importESModule( + "resource://gre/modules/XPCOMUtils.sys.mjs" +); + +const lazy = {}; + +XPCOMUtils.defineLazyServiceGetter( + lazy, + "gNetworkLinkService", + "@mozilla.org/network/network-link-service;1", + "nsINetworkLinkService" +); + +XPCOMUtils.defineLazyServiceGetter( + lazy, + "gCaptivePortalService", + "@mozilla.org/network/captive-portal-service;1", + "nsICaptivePortalService" +); + +// The canonical domain whose subdomains we will be resolving. +XPCOMUtils.defineLazyPreferenceGetter( + lazy, + "kCanonicalDomain", + "doh-rollout.trrRace.canonicalDomain", + "firefox-dns-perf-test.net." +); + +// The number of random subdomains to resolve per TRR. +XPCOMUtils.defineLazyPreferenceGetter( + lazy, + "kRepeats", + "doh-rollout.trrRace.randomSubdomainCount", + 5 +); + +// The "popular" domain that we expect the TRRs to have cached. +XPCOMUtils.defineLazyPreferenceGetter( + lazy, + "kPopularDomains", + "doh-rollout.trrRace.popularDomains", + null, + null, + val => + val + ? val.split(",").map(t => t.trim()) + : [ + "google.com.", + "youtube.com.", + "amazon.com.", + "facebook.com.", + "yahoo.com.", + ] +); + +function getRandomSubdomain() { + let uuid = Services.uuid + .generateUUID() + .toString() + .slice(1, -1); // Discard surrounding braces + return `${uuid}.${lazy.kCanonicalDomain}`; +} + +// A wrapper around async DNS lookups. The results are passed on to the supplied +// callback. The wrapper attempts the lookup 3 times before passing on a failure. +// If a false-y `domain` is supplied, a random subdomain will be used. Each retry +// will use a different random subdomain to ensure we bypass chached responses. +class DNSLookup { + constructor(domain, trrServer, callback) { + this._domain = domain; + this.trrServer = trrServer; + this.callback = callback; + this.retryCount = 0; + } + + doLookup() { + this.retryCount++; + try { + this.usedDomain = this._domain || getRandomSubdomain(); + Services.dns.asyncResolve( + this.usedDomain, + Ci.nsIDNSService.RESOLVE_TYPE_DEFAULT, + Ci.nsIDNSService.RESOLVE_BYPASS_CACHE, + Services.dns.newAdditionalInfo(this.trrServer, -1), + this, + Services.tm.currentThread, + {} + ); + } catch (e) { + Cu.reportError(e); + } + } + + onLookupComplete(request, record, status) { + // Try again if we failed... + if (!Components.isSuccessCode(status) && this.retryCount < 3) { + this.doLookup(); + return; + } + + // But after the third try, just pass the status on. + this.callback(request, record, status, this.usedDomain, this.retryCount); + } +} + +DNSLookup.prototype.QueryInterface = ChromeUtils.generateQI(["nsIDNSListener"]); + +// A wrapper around a single set of measurements. The required lookups are +// triggered and the results aggregated before telemetry is sent. If aborted, +// any aggregated results are discarded. +class LookupAggregator { + constructor(onCompleteCallback, trrList) { + this.onCompleteCallback = onCompleteCallback; + this.trrList = trrList; + this.aborted = false; + this.networkUnstable = false; + this.captivePortal = false; + + this.domains = []; + for (let i = 0; i < lazy.kRepeats; ++i) { + // false-y domain will cause DNSLookup to generate a random one. + this.domains.push(null); + } + this.domains.push(...lazy.kPopularDomains); + this.totalLookups = this.trrList.length * this.domains.length; + this.completedLookups = 0; + this.results = []; + } + + run() { + if (this._ran || this._aborted) { + Cu.reportError("Trying to re-run a LookupAggregator."); + return; + } + + this._ran = true; + for (let trr of this.trrList) { + for (let domain of this.domains) { + new DNSLookup( + domain, + trr, + (request, record, status, usedDomain, retryCount) => { + this.results.push({ + domain: usedDomain, + trr, + status, + time: record + ? record.QueryInterface(Ci.nsIDNSAddrRecord) + .trrFetchDurationNetworkOnly + : -1, + retryCount, + }); + + this.completedLookups++; + if (this.completedLookups == this.totalLookups) { + this.recordResults(); + } + } + ).doLookup(); + } + } + } + + abort() { + this.aborted = true; + } + + markUnstableNetwork() { + this.networkUnstable = true; + } + + markCaptivePortal() { + this.captivePortal = true; + } + + recordResults() { + if (this.aborted) { + return; + } + + for (let { domain, trr, status, time, retryCount } of this.results) { + if ( + !( + lazy.kPopularDomains.includes(domain) || + domain.includes(lazy.kCanonicalDomain) + ) + ) { + Cu.reportError("Expected known domain for reporting, got " + domain); + return; + } + + Services.telemetry.recordEvent( + "security.doh.trrPerformance", + "resolved", + "record", + "success", + { + domain, + trr, + status: status.toString(), + time: time.toString(), + retryCount: retryCount.toString(), + networkUnstable: this.networkUnstable.toString(), + captivePortal: this.captivePortal.toString(), + } + ); + } + + this.onCompleteCallback(); + } +} + +// This class monitors the network and spawns a new LookupAggregator when ready. +// When the network goes down, an ongoing aggregator is aborted and a new one +// spawned next time we get a link, up to 5 times. On the fifth time, we just +// let the aggegator complete and mark it as tainted. +class TRRRacer { + constructor(onCompleteCallback, trrList) { + this._aggregator = null; + this._retryCount = 0; + this._complete = false; + this._onCompleteCallback = onCompleteCallback; + this._trrList = trrList; + } + + run() { + if ( + lazy.gNetworkLinkService.isLinkUp && + lazy.gCaptivePortalService.state != + lazy.gCaptivePortalService.LOCKED_PORTAL + ) { + this._runNewAggregator(); + if ( + lazy.gCaptivePortalService.state == + lazy.gCaptivePortalService.UNLOCKED_PORTAL + ) { + this._aggregator.markCaptivePortal(); + } + } + + Services.obs.addObserver(this, "ipc:network:captive-portal-set-state"); + Services.obs.addObserver(this, "network:link-status-changed"); + } + + onComplete() { + Services.obs.removeObserver(this, "ipc:network:captive-portal-set-state"); + Services.obs.removeObserver(this, "network:link-status-changed"); + + this._complete = true; + + if (this._onCompleteCallback) { + this._onCompleteCallback(); + } + } + + getFastestTRR(returnRandomDefault = false) { + if (!this._complete) { + throw new Error("getFastestTRR: Measurement still running."); + } + + return this._getFastestTRRFromResults( + this._aggregator.results, + returnRandomDefault + ); + } + + /* + * Given an array of { trr, time }, returns the trr with smallest mean time. + * Separate from _getFastestTRR for easy unit-testing. + * + * @returns The TRR with the fastest average time. + * If returnRandomDefault is false-y, returns undefined if no valid + * times were present in the results. Otherwise, returns one of the + * present TRRs at random. + */ + _getFastestTRRFromResults(results, returnRandomDefault = false) { + // First, organize the results into a map of TRR -> array of times + let TRRTimingMap = new Map(); + let TRRErrorCount = new Map(); + for (let { trr, time } of results) { + if (!TRRTimingMap.has(trr)) { + TRRTimingMap.set(trr, []); + } + if (time != -1) { + TRRTimingMap.get(trr).push(time); + } else { + TRRErrorCount.set(trr, 1 + (TRRErrorCount.get(trr) || 0)); + } + } + + // Loop through each TRR's array of times, compute the geometric means, + // and remember the fastest TRR. Geometric mean is a bit more forgiving + // in the presence of noise (anomalously high values). + // We don't need the full geometric mean, we simply calculate the arithmetic + // means in log-space and then compare those values. + let fastestTRR; + let fastestAverageTime = -1; + let trrs = [...TRRTimingMap.keys()]; + for (let trr of trrs) { + let times = TRRTimingMap.get(trr); + if (!times.length) { + continue; + } + + // Skip TRRs that had an error rate of more than 30%. + let errorCount = TRRErrorCount.get(trr) || 0; + let totalResults = times.length + errorCount; + if (errorCount / totalResults > 0.3) { + continue; + } + + // Arithmetic mean in log space. Take log of (a + 1) to ensure we never + // take log(0) which would be -Infinity. + let averageTime = + times.map(a => Math.log(a + 1)).reduce((a, b) => a + b) / times.length; + if (fastestAverageTime == -1 || averageTime < fastestAverageTime) { + fastestAverageTime = averageTime; + fastestTRR = trr; + } + } + + if (returnRandomDefault && !fastestTRR) { + fastestTRR = trrs[Math.floor(Math.random() * trrs.length)]; + } + + return fastestTRR; + } + + _runNewAggregator() { + this._aggregator = new LookupAggregator( + () => this.onComplete(), + this._trrList + ); + this._aggregator.run(); + this._retryCount++; + } + + // When the link goes *down*, or when we detect a locked captive portal, we + // abort any ongoing LookupAggregator run. When the link goes *up*, or we + // detect a newly unlocked portal, we start a run if one isn't ongoing. + observe(subject, topic, data) { + switch (topic) { + case "network:link-status-changed": + if (this._aggregator && data == "down") { + if (this._retryCount < 5) { + this._aggregator.abort(); + } else { + this._aggregator.markUnstableNetwork(); + } + } else if ( + data == "up" && + (!this._aggregator || this._aggregator.aborted) + ) { + this._runNewAggregator(); + } + break; + case "ipc:network:captive-portal-set-state": + if ( + this._aggregator && + lazy.gCaptivePortalService.state == + lazy.gCaptivePortalService.LOCKED_PORTAL + ) { + if (this._retryCount < 5) { + this._aggregator.abort(); + } else { + this._aggregator.markCaptivePortal(); + } + } else if ( + lazy.gCaptivePortalService.state == + lazy.gCaptivePortalService.UNLOCKED_PORTAL && + (!this._aggregator || this._aggregator.aborted) + ) { + this._runNewAggregator(); + } + break; + } + } +} |