diff options
Diffstat (limited to 'services/sync/modules/telemetry.sys.mjs')
-rw-r--r-- | services/sync/modules/telemetry.sys.mjs | 1313 |
1 files changed, 1313 insertions, 0 deletions
diff --git a/services/sync/modules/telemetry.sys.mjs b/services/sync/modules/telemetry.sys.mjs new file mode 100644 index 0000000000..9a12be9714 --- /dev/null +++ b/services/sync/modules/telemetry.sys.mjs @@ -0,0 +1,1313 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// Support for Sync-and-FxA-related telemetry, which is submitted in a special-purpose +// telemetry ping called the "sync ping", documented here: +// +// ../../../toolkit/components/telemetry/docs/data/sync-ping.rst +// +// The sync ping contains identifiers that are linked to the user's Firefox Account +// and are separate from the main telemetry client_id, so this file is also responsible +// for ensuring that we can delete those pings upon user request, by plumbing its +// identifiers into the "deletion-request" ping. + +import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs"; + +import { Log } from "resource://gre/modules/Log.sys.mjs"; + +const lazy = {}; + +ChromeUtils.defineESModuleGetters(lazy, { + Async: "resource://services-common/async.sys.mjs", + AuthenticationError: "resource://services-sync/sync_auth.sys.mjs", + FxAccounts: "resource://gre/modules/FxAccounts.sys.mjs", + Observers: "resource://services-common/observers.sys.mjs", + Resource: "resource://services-sync/resource.sys.mjs", + Status: "resource://services-sync/status.sys.mjs", + Svc: "resource://services-sync/util.sys.mjs", + TelemetryController: "resource://gre/modules/TelemetryController.sys.mjs", + TelemetryEnvironment: "resource://gre/modules/TelemetryEnvironment.sys.mjs", + TelemetryUtils: "resource://gre/modules/TelemetryUtils.sys.mjs", + Weave: "resource://services-sync/main.sys.mjs", +}); + +XPCOMUtils.defineLazyModuleGetters(lazy, { + ObjectUtils: "resource://gre/modules/ObjectUtils.jsm", +}); + +XPCOMUtils.defineLazyGetter(lazy, "fxAccounts", () => { + return ChromeUtils.importESModule( + "resource://gre/modules/FxAccounts.sys.mjs" + ).getFxAccountsSingleton(); +}); + +import * as constants from "resource://services-sync/constants.sys.mjs"; + +XPCOMUtils.defineLazyGetter( + lazy, + "WeaveService", + () => Cc["@mozilla.org/weave/service;1"].getService().wrappedJSObject +); +const log = Log.repository.getLogger("Sync.Telemetry"); + +const TOPICS = [ + // For tracking change to account/device identifiers. + "fxaccounts:new_device_id", + "fxaccounts:onlogout", + "weave:service:ready", + "weave:service:login:got-hashed-id", + + // For whole-of-sync metrics. + "weave:service:sync:start", + "weave:service:sync:finish", + "weave:service:sync:error", + + // For individual engine metrics. + "weave:engine:sync:start", + "weave:engine:sync:finish", + "weave:engine:sync:error", + "weave:engine:sync:applied", + "weave:engine:sync:step", + "weave:engine:sync:uploaded", + "weave:engine:validate:finish", + "weave:engine:validate:error", + + // For ad-hoc telemetry events. + "weave:telemetry:event", + "weave:telemetry:histogram", + "fxa:telemetry:event", + + "weave:telemetry:migration", +]; + +const PING_FORMAT_VERSION = 1; + +const EMPTY_UID = "0".repeat(32); + +// The set of engines we record telemetry for - any other engines are ignored. +const ENGINES = new Set([ + "addons", + "bookmarks", + "clients", + "forms", + "history", + "passwords", + "prefs", + "tabs", + "extension-storage", + "addresses", + "creditcards", +]); + +function tryGetMonotonicTimestamp() { + try { + return Services.telemetry.msSinceProcessStart(); + } catch (e) { + log.warn("Unable to get a monotonic timestamp!"); + return -1; + } +} + +function timeDeltaFrom(monotonicStartTime) { + let now = tryGetMonotonicTimestamp(); + if (monotonicStartTime !== -1 && now !== -1) { + return Math.round(now - monotonicStartTime); + } + return -1; +} + +const NS_ERROR_MODULE_BASE_OFFSET = 0x45; +const NS_ERROR_MODULE_NETWORK = 6; + +// A reimplementation of NS_ERROR_GET_MODULE, which surprisingly doesn't seem +// to exist anywhere in .js code in a way that can be reused. +// This is taken from DownloadCore.sys.mjs. +function NS_ERROR_GET_MODULE(code) { + return ((code & 0x7fff0000) >> 16) - NS_ERROR_MODULE_BASE_OFFSET; +} + +// Converts extra integer fields to strings, rounds floats to three +// decimal places (nanosecond precision for timings), and removes profile +// directory paths and URLs from potential error messages. +function normalizeExtraTelemetryFields(extra) { + let result = {}; + for (let key in extra) { + let value = extra[key]; + let type = typeof value; + if (type == "string") { + result[key] = ErrorSanitizer.cleanErrorMessage(value); + } else if (type == "number") { + result[key] = Number.isInteger(value) + ? value.toString(10) + : value.toFixed(3); + } else if (type != "undefined") { + throw new TypeError( + `Invalid type ${type} for extra telemetry field ${key}` + ); + } + } + return lazy.ObjectUtils.isEmpty(result) ? undefined : result; +} + +// Keps track of the counts of individual records fate during a sync cycle +// The main reason this is a class is to keep track of reasons individual records +// failure reasons without huge memory overhead. +export class SyncedRecordsTelemetry { + // applied => number of items that should be applied. + // failed => number of items that failed in this sync. + // newFailed => number of items that failed for the first time in this sync. + // reconciled => number of items that were reconciled. + // failedReasons => {name, count} of reasons a record failed + incomingCounts = { + applied: 0, + failed: 0, + newFailed: 0, + reconciled: 0, + failedReasons: null, + }; + outgoingCounts = { failed: 0, sent: 0, failedReasons: null }; + + addIncomingFailedReason(reason) { + if (!this.incomingCounts.failedReasons) { + this.incomingCounts.failedReasons = []; + } + let transformedReason = SyncTelemetry.transformError(reason); + // Some errors like http/nss errors don't have an error object + // those will be caught by the higher level telemetry + if (!transformedReason.error) { + return; + } + + let index = this.incomingCounts.failedReasons.findIndex( + reasons => reasons.name === transformedReason.error + ); + + if (index >= 0) { + this.incomingCounts.failedReasons[index].count += 1; + } else { + this.incomingCounts.failedReasons.push({ + name: transformedReason.error, + count: 1, + }); + } + } + + addOutgoingFailedReason(reason) { + if (!this.outgoingCounts.failedReasons) { + this.outgoingCounts.failedReasons = []; + } + let transformedReason = SyncTelemetry.transformError(reason); + // Some errors like http/nss errors don't have an error object + // those will be caught by the higher level telemetry + if (!transformedReason.error) { + return; + } + let index = this.outgoingCounts.failedReasons.findIndex( + reasons => reasons.name === transformedReason.error + ); + if (index >= 0) { + this.outgoingCounts.failedReasons[index].count += 1; + } else { + this.outgoingCounts.failedReasons.push({ + name: transformedReason.error, + count: 1, + }); + } + } +} + +// The `ErrorSanitizer` has 2 main jobs: +// * Remove PII from errors, such as the profile dir or URLs the user might +// have visited. +// * Normalize errors so different locales or operating systems etc don't +// generate different messages for the same underlying error. +// * [TODO] Normalize errors so environmental factors don't influence message. +// For example, timestamps or GUIDs should be replaced with something static. +export class ErrorSanitizer { + // Things we normalize - this isn't exhaustive, but covers the common error messages we see. + // Eg: + // > Win error 112 during operation write on file [profileDir]\weave\addonsreconciler.json (Espacio en disco insuficiente. ) + // > Win error 112 during operation write on file [profileDir]\weave\addonsreconciler.json (Diskte yeterli yer yok. ) + // > <snip many other translations of the error> + // > Unix error 28 during operation write on file [profileDir]/weave/addonsreconciler.json (No space left on device) + // These tend to crowd out other errors we might care about (eg, top 5 errors for some engines are + // variations of the "no space left on device") + + // Note that only errors that have same-but-different errors on Windows and Unix are here - we + // still sanitize ones that aren't in these maps to remove the translations etc - eg, + // `ERROR_SHARING_VIOLATION` doesn't really have a unix equivalent, so doesn't appear here, but + // we still strip the translations to avoid the variants. + static E_NO_SPACE_ON_DEVICE = "OS error [No space left on device]"; + static E_PERMISSION_DENIED = "OS error [Permission denied]"; + static E_NO_FILE_OR_DIR = "OS error [File/Path not found]"; + static E_NO_MEM = "OS error [No memory]"; + + static WindowsErrorSubstitutions = { + 2: this.E_NO_FILE_OR_DIR, // ERROR_FILE_NOT_FOUND + 3: this.E_NO_FILE_OR_DIR, // ERROR_PATH_NOT_FOUND + 5: this.E_PERMISSION_DENIED, // ERROR_ACCESS_DENIED + 8: this.E_NO_MEM, // ERROR_NOT_ENOUGH_MEMORY + 112: this.E_NO_SPACE_ON_DEVICE, // ERROR_DISK_FULL + }; + + static UnixErrorSubstitutions = { + 2: this.E_NO_FILE_OR_DIR, // ENOENT + 12: this.E_NO_MEM, // ENOMEM + 13: this.E_PERMISSION_DENIED, // EACCESS + 28: this.E_NO_SPACE_ON_DEVICE, // ENOSPC + }; + + static DOMErrorSubstitutions = { + NotFoundError: this.E_NO_FILE_OR_DIR, + NotAllowedError: this.E_PERMISSION_DENIED, + }; + + static reWinError = + /^(?<head>Win error (?<errno>\d+))(?<detail>.*) \(.*\r?\n?\)$/m; + static reUnixError = + /^(?<head>Unix error (?<errno>\d+))(?<detail>.*) \(.*\)$/; + + static #cleanOSErrorMessage(message, error = undefined) { + if (DOMException.isInstance(error)) { + const sub = this.DOMErrorSubstitutions[error.name]; + message = message.replaceAll("\\", "/"); + if (sub) { + return `${sub} ${message}`; + } + } + + let match = this.reWinError.exec(message); + if (match) { + let head = + this.WindowsErrorSubstitutions[match.groups.errno] || match.groups.head; + return head + match.groups.detail.replaceAll("\\", "/"); + } + match = this.reUnixError.exec(message); + if (match) { + let head = + this.UnixErrorSubstitutions[match.groups.errno] || match.groups.head; + return head + match.groups.detail; + } + return message; + } + + // A regex we can use to replace the profile dir in error messages. We use a + // regexp so we can simply replace all case-insensitive occurences. + // This escaping function is from: + // https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions + static reProfileDir = new RegExp( + PathUtils.profileDir.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), + "gi" + ); + + /** + * Clean an error message, removing PII and normalizing OS-specific messages. + * + * @param {string} message The error message + * @param {Error?} error The error class instance, if any. + */ + static cleanErrorMessage(message, error = undefined) { + // There's a chance the profiledir is in the error string which is PII we + // want to avoid including in the ping. + message = message.replace(this.reProfileDir, "[profileDir]"); + // MSG_INVALID_URL from /dom/bindings/Errors.msg -- no way to access this + // directly from JS. + if (message.endsWith("is not a valid URL.")) { + message = "<URL> is not a valid URL."; + } + // Try to filter things that look somewhat like a URL (in that they contain a + // colon in the middle of non-whitespace), in case anything else is including + // these in error messages. Note that JSON.stringified stuff comes through + // here, so we explicitly ignore double-quotes as well. + message = message.replace(/[^\s"]+:[^\s"]+/g, "<URL>"); + + // Anywhere that's normalized the guid in errors we can easily filter + // to make it easier to aggregate these types of errors + message = message.replace(/<guid: ([^>]+)>/g, "<GUID>"); + + return this.#cleanOSErrorMessage(message, error); + } +} + +// This function validates the payload of a telemetry "event" - this can be +// removed once there are APIs available for the telemetry modules to collect +// these events (bug 1329530) - but for now we simulate that planned API as +// best we can. +function validateTelemetryEvent(eventDetails) { + let { object, method, value, extra } = eventDetails; + // Do do basic validation of the params - everything except "extra" must + // be a string. method and object are required. + if ( + typeof method != "string" || + typeof object != "string" || + (value && typeof value != "string") || + (extra && typeof extra != "object") + ) { + log.warn("Invalid event parameters - wrong types", eventDetails); + return false; + } + // length checks. + if ( + method.length > 20 || + object.length > 20 || + (value && value.length > 80) + ) { + log.warn("Invalid event parameters - wrong lengths", eventDetails); + return false; + } + + // extra can be falsey, or an object with string names and values. + if (extra) { + if (Object.keys(extra).length > 10) { + log.warn("Invalid event parameters - too many extra keys", eventDetails); + return false; + } + for (let [ename, evalue] of Object.entries(extra)) { + if ( + typeof ename != "string" || + ename.length > 15 || + typeof evalue != "string" || + evalue.length > 85 + ) { + log.warn( + `Invalid event parameters: extra item "${ename} is invalid`, + eventDetails + ); + return false; + } + } + } + return true; +} + +class EngineRecord { + constructor(name) { + // startTime is in ms from process start, but is monotonic (unlike Date.now()) + // so we need to keep both it and when. + this.startTime = tryGetMonotonicTimestamp(); + this.name = name; + + // This allows cases like bookmarks-buffered to have a separate name from + // the bookmarks engine. + let engineImpl = lazy.Weave.Service.engineManager.get(name); + if (engineImpl && engineImpl.overrideTelemetryName) { + this.overrideTelemetryName = engineImpl.overrideTelemetryName; + } + } + + toJSON() { + let result = { name: this.overrideTelemetryName || this.name }; + let properties = [ + "took", + "status", + "failureReason", + "incoming", + "outgoing", + "validation", + "steps", + ]; + for (let property of properties) { + result[property] = this[property]; + } + return result; + } + + finished(error) { + let took = timeDeltaFrom(this.startTime); + if (took > 0) { + this.took = took; + } + if (error) { + this.failureReason = SyncTelemetry.transformError(error); + } + } + + recordApplied(counts) { + if (this.incoming) { + log.error( + `Incoming records applied multiple times for engine ${this.name}!` + ); + return; + } + if (this.name === "clients" && !counts.failed) { + // ignore successful application of client records + // since otherwise they show up every time and are meaningless. + return; + } + + let incomingData = {}; + + if (counts.failedReasons) { + // sort the failed reasons in desc by count, then take top 10 + counts.failedReasons = counts.failedReasons + .sort((a, b) => b.count - a.count) + .slice(0, 10); + } + // Counts has extra stuff used for logging, but we only care about a few + let properties = ["applied", "failed", "failedReasons"]; + // Only record non-zero properties and only record incoming at all if + // there's at least one property we care about. + for (let property of properties) { + if (counts[property]) { + incomingData[property] = counts[property]; + this.incoming = incomingData; + } + } + } + + recordStep(stepResult) { + let step = { + name: stepResult.name, + }; + if (stepResult.took > 0) { + step.took = Math.round(stepResult.took); + } + if (stepResult.counts) { + let counts = stepResult.counts.filter(({ count }) => count > 0); + if (counts.length) { + step.counts = counts; + } + } + if (this.steps) { + this.steps.push(step); + } else { + this.steps = [step]; + } + } + + recordValidation(validationResult) { + if (this.validation) { + log.error(`Multiple validations occurred for engine ${this.name}!`); + return; + } + let { problems, version, took, checked } = validationResult; + let validation = { + version: version || 0, + checked: checked || 0, + }; + if (took > 0) { + validation.took = Math.round(took); + } + let summarized = problems.filter(({ count }) => count > 0); + if (summarized.length) { + validation.problems = summarized; + } + this.validation = validation; + } + + recordValidationError(e) { + if (this.validation) { + log.error(`Multiple validations occurred for engine ${this.name}!`); + return; + } + + this.validation = { + failureReason: SyncTelemetry.transformError(e), + }; + } + + recordUploaded(counts) { + if (counts.sent || counts.failed) { + if (!this.outgoing) { + this.outgoing = []; + } + if (counts.failedReasons) { + // sort the failed reasons in desc by count, then take top 10 + counts.failedReasons = counts.failedReasons + .sort((a, b) => b.count - a.count) + .slice(0, 10); + } + this.outgoing.push({ + sent: counts.sent || undefined, + failed: counts.failed || undefined, + failedReasons: counts.failedReasons || undefined, + }); + } + } +} + +// The record of a single "sync" - typically many of these are submitted in +// a single ping (ie, as a 'syncs' array) +export class SyncRecord { + constructor(allowedEngines, why) { + this.allowedEngines = allowedEngines; + // Our failure reason. This property only exists in the generated ping if an + // error actually occurred. + this.failureReason = undefined; + this.syncNodeType = null; + this.when = Date.now(); + this.startTime = tryGetMonotonicTimestamp(); + this.took = 0; // will be set later. + this.why = why; + + // All engines that have finished (ie, does not include the "current" one) + // We omit this from the ping if it's empty. + this.engines = []; + // The engine that has started but not yet stopped. + this.currentEngine = null; + } + + toJSON() { + let result = { + when: this.when, + took: this.took, + failureReason: this.failureReason, + status: this.status, + }; + if (this.why) { + result.why = this.why; + } + let engines = []; + for (let engine of this.engines) { + engines.push(engine.toJSON()); + } + if (engines.length) { + result.engines = engines; + } + return result; + } + + finished(error) { + this.took = timeDeltaFrom(this.startTime); + if (this.currentEngine != null) { + log.error( + "Finished called for the sync before the current engine finished" + ); + this.currentEngine.finished(null); + this.onEngineStop(this.currentEngine.name); + } + if (error) { + this.failureReason = SyncTelemetry.transformError(error); + } + + this.syncNodeType = lazy.Weave.Service.identity.telemetryNodeType; + + // Check for engine statuses. -- We do this now, and not in engine.finished + // to make sure any statuses that get set "late" are recorded + for (let engine of this.engines) { + let status = lazy.Status.engines[engine.name]; + if (status && status !== constants.ENGINE_SUCCEEDED) { + engine.status = status; + } + } + + let statusObject = {}; + + let serviceStatus = lazy.Status.service; + if (serviceStatus && serviceStatus !== constants.STATUS_OK) { + statusObject.service = serviceStatus; + this.status = statusObject; + } + let syncStatus = lazy.Status.sync; + if (syncStatus && syncStatus !== constants.SYNC_SUCCEEDED) { + statusObject.sync = syncStatus; + this.status = statusObject; + } + } + + onEngineStart(engineName) { + if (this._shouldIgnoreEngine(engineName, false)) { + return; + } + + if (this.currentEngine) { + log.error( + `Being told that engine ${engineName} has started, but current engine ${this.currentEngine.name} hasn't stopped` + ); + // Just discard the current engine rather than making up data for it. + } + this.currentEngine = new EngineRecord(engineName); + } + + onEngineStop(engineName, error) { + // We only care if it's the current engine if we have a current engine. + if (this._shouldIgnoreEngine(engineName, !!this.currentEngine)) { + return; + } + if (!this.currentEngine) { + // It's possible for us to get an error before the start message of an engine + // (somehow), in which case we still want to record that error. + if (!error) { + return; + } + log.error( + `Error triggered on ${engineName} when no current engine exists: ${error}` + ); + this.currentEngine = new EngineRecord(engineName); + } + this.currentEngine.finished(error); + this.engines.push(this.currentEngine); + this.currentEngine = null; + } + + onEngineApplied(engineName, counts) { + if (this._shouldIgnoreEngine(engineName)) { + return; + } + this.currentEngine.recordApplied(counts); + } + + onEngineStep(engineName, step) { + if (this._shouldIgnoreEngine(engineName)) { + return; + } + this.currentEngine.recordStep(step); + } + + onEngineValidated(engineName, validationData) { + if (this._shouldIgnoreEngine(engineName, false)) { + return; + } + let engine = this.engines.find(e => e.name === engineName); + if ( + !engine && + this.currentEngine && + engineName === this.currentEngine.name + ) { + engine = this.currentEngine; + } + if (engine) { + engine.recordValidation(validationData); + } else { + log.warn( + `Validation event triggered for engine ${engineName}, which hasn't been synced!` + ); + } + } + + onEngineValidateError(engineName, error) { + if (this._shouldIgnoreEngine(engineName, false)) { + return; + } + let engine = this.engines.find(e => e.name === engineName); + if ( + !engine && + this.currentEngine && + engineName === this.currentEngine.name + ) { + engine = this.currentEngine; + } + if (engine) { + engine.recordValidationError(error); + } else { + log.warn( + `Validation failure event triggered for engine ${engineName}, which hasn't been synced!` + ); + } + } + + onEngineUploaded(engineName, counts) { + if (this._shouldIgnoreEngine(engineName)) { + return; + } + this.currentEngine.recordUploaded(counts); + } + + _shouldIgnoreEngine(engineName, shouldBeCurrent = true) { + if (!this.allowedEngines.has(engineName)) { + log.info( + `Notification for engine ${engineName}, but we aren't recording telemetry for it` + ); + return true; + } + if (shouldBeCurrent) { + if (!this.currentEngine || engineName != this.currentEngine.name) { + log.info(`Notification for engine ${engineName} but it isn't current`); + return true; + } + } + return false; + } +} + +// The entire "sync ping" - it includes all the syncs, events etc recorded in +// the ping. +class SyncTelemetryImpl { + constructor(allowedEngines) { + log.manageLevelFromPref("services.sync.log.logger.telemetry"); + // This is accessible so we can enable custom engines during tests. + this.allowedEngines = allowedEngines; + this.current = null; + this.setupObservers(); + + this.payloads = []; + this.discarded = 0; + this.events = []; + this.histograms = {}; + this.migrations = []; + this.maxEventsCount = lazy.Svc.Prefs.get("telemetry.maxEventsCount", 1000); + this.maxPayloadCount = lazy.Svc.Prefs.get("telemetry.maxPayloadCount"); + this.submissionInterval = + lazy.Svc.Prefs.get("telemetry.submissionInterval") * 1000; + this.lastSubmissionTime = Services.telemetry.msSinceProcessStart(); + this.lastUID = EMPTY_UID; + this.lastSyncNodeType = null; + this.currentSyncNodeType = null; + // Note that the sessionStartDate is somewhat arbitrary - the telemetry + // modules themselves just use `new Date()`. This means that our startDate + // isn't going to be the same as the sessionStartDate in the main pings, + // but that's OK for now - if it's a problem we'd need to change the + // telemetry modules to expose what it thinks the sessionStartDate is. + let sessionStartDate = new Date(); + this.sessionStartDate = lazy.TelemetryUtils.toLocalTimeISOString( + lazy.TelemetryUtils.truncateToHours(sessionStartDate) + ); + lazy.TelemetryController.registerSyncPingShutdown(() => this.shutdown()); + } + + sanitizeFxaDeviceId(deviceId) { + return lazy.fxAccounts.telemetry.sanitizeDeviceId(deviceId); + } + + prepareFxaDevices(devices) { + // For non-sync users, the data per device is limited -- just an id and a + // type (and not even the id yet). For sync users, if we can correctly map + // the fxaDevice to a sync device, then we can get os and version info, + // which would be quite unfortunate to lose. + let extraInfoMap = new Map(); + if (this.syncIsEnabled()) { + for (let client of this.getClientsEngineRecords()) { + if (client.fxaDeviceId) { + extraInfoMap.set(client.fxaDeviceId, { + os: client.os, + version: client.version, + syncID: this.sanitizeFxaDeviceId(client.id), + }); + } + } + } + // Finally, sanitize and convert to the proper format. + return devices.map(d => { + let { os, version, syncID } = extraInfoMap.get(d.id) || { + os: undefined, + version: undefined, + syncID: undefined, + }; + return { + id: this.sanitizeFxaDeviceId(d.id) || EMPTY_UID, + type: d.type, + os, + version, + syncID, + }; + }); + } + + syncIsEnabled() { + return lazy.WeaveService.enabled && lazy.WeaveService.ready; + } + + // Separate for testing. + getClientsEngineRecords() { + if (!this.syncIsEnabled()) { + throw new Error("Bug: syncIsEnabled() must be true, check it first"); + } + return lazy.Weave.Service.clientsEngine.remoteClients; + } + + updateFxaDevices(devices) { + if (!devices) { + return {}; + } + let me = devices.find(d => d.isCurrentDevice); + let id = me ? this.sanitizeFxaDeviceId(me.id) : undefined; + let cleanDevices = this.prepareFxaDevices(devices); + return { deviceID: id, devices: cleanDevices }; + } + + getFxaDevices() { + return lazy.fxAccounts.device.recentDeviceList; + } + + getPingJSON(reason) { + let { devices, deviceID } = this.updateFxaDevices(this.getFxaDevices()); + return { + os: lazy.TelemetryEnvironment.currentEnvironment.system.os, + why: reason, + devices, + discarded: this.discarded || undefined, + version: PING_FORMAT_VERSION, + syncs: this.payloads.slice(), + uid: this.lastUID, + syncNodeType: this.lastSyncNodeType || undefined, + deviceID, + sessionStartDate: this.sessionStartDate, + events: !this.events.length ? undefined : this.events, + migrations: !this.migrations.length ? undefined : this.migrations, + histograms: !Object.keys(this.histograms).length + ? undefined + : this.histograms, + }; + } + + _addMigrationRecord(type, info) { + log.debug("Saw telemetry migration info", type, info); + // Updates to this need to be documented in `sync-ping.rst` + switch (type) { + case "webext-storage": + this.migrations.push({ + type: "webext-storage", + entries: +info.entries, + entriesSuccessful: +info.entries_successful, + extensions: +info.extensions, + extensionsSuccessful: +info.extensions_successful, + openFailure: !!info.open_failure, + }); + break; + default: + throw new Error("Bug: Unknown migration record type " + type); + } + } + + finish(reason) { + // Note that we might be in the middle of a sync right now, and so we don't + // want to touch this.current. + let result = this.getPingJSON(reason); + this.payloads = []; + this.discarded = 0; + this.events = []; + this.migrations = []; + this.histograms = {}; + this.submit(result); + } + + setupObservers() { + for (let topic of TOPICS) { + lazy.Observers.add(topic, this, this); + } + } + + shutdown() { + this.finish("shutdown"); + for (let topic of TOPICS) { + lazy.Observers.remove(topic, this, this); + } + } + + submit(record) { + if (!this.isProductionSyncUser()) { + return false; + } + // We still call submit() with possibly illegal payloads so that tests can + // know that the ping was built. We don't end up submitting them, however. + let numEvents = record.events ? record.events.length : 0; + let numMigrations = record.migrations ? record.migrations.length : 0; + if (record.syncs.length || numEvents || numMigrations) { + log.trace( + `submitting ${record.syncs.length} sync record(s) and ` + + `${numEvents} event(s) to telemetry` + ); + lazy.TelemetryController.submitExternalPing("sync", record, { + usePingSender: true, + }).catch(err => { + log.error("failed to submit ping", err); + }); + return true; + } + return false; + } + + isProductionSyncUser() { + // If FxA isn't production then we treat sync as not being production. + // Further, there's the deprecated "services.sync.tokenServerURI" pref we + // need to consider - fxa doesn't consider that as if that's the only + // pref set, they *are* running a production fxa, just not production sync. + if ( + !lazy.FxAccounts.config.isProductionConfig() || + Services.prefs.prefHasUserValue("services.sync.tokenServerURI") + ) { + log.trace(`Not sending telemetry ping for self-hosted Sync user`); + return false; + } + return true; + } + + onSyncStarted(data) { + const why = data && JSON.parse(data).why; + if (this.current) { + log.warn( + "Observed weave:service:sync:start, but we're already recording a sync!" + ); + // Just discard the old record, consistent with our handling of engines, above. + this.current = null; + } + this.current = new SyncRecord(this.allowedEngines, why); + } + + // We need to ensure that the telemetry `deletion-request` ping always contains the user's + // current sync device ID, because if the user opts out of telemetry then the deletion ping + // will be immediately triggered for sending, and we won't have a chance to fill it in later. + // This keeps the `deletion-ping` up-to-date when the user's account state changes. + onAccountInitOrChange() { + // We don't submit sync pings for self-hosters, so don't need to collect their device ids either. + if (!this.isProductionSyncUser()) { + return; + } + // Awkwardly async, but no need to await. If the user's account state changes while + // this promise is in flight, it will reject and we won't record any data in the ping. + // (And a new notification will trigger us to try again with the new state). + lazy.fxAccounts.device + .getLocalId() + .then(deviceId => { + let sanitizedDeviceId = + lazy.fxAccounts.telemetry.sanitizeDeviceId(deviceId); + // In the past we did not persist the FxA metrics identifiers to disk, + // so this might be missing until we can fetch it from the server for the + // first time. There will be a fresh notification tirggered when it's available. + if (sanitizedDeviceId) { + // Sanitized device ids are 64 characters long, but telemetry limits scalar strings to 50. + // The first 32 chars are sufficient to uniquely identify the device, so just send those. + // It's hard to change the sync ping itself to only send 32 chars, to b/w compat reasons. + sanitizedDeviceId = sanitizedDeviceId.substr(0, 32); + Services.telemetry.scalarSet( + "deletion.request.sync_device_id", + sanitizedDeviceId + ); + } + }) + .catch(err => { + log.warn( + `Failed to set sync identifiers in the deletion-request ping: ${err}` + ); + }); + } + + // This keeps the `deletion-request` ping up-to-date when the user signs out, + // clearing the now-nonexistent sync device id. + onAccountLogout() { + Services.telemetry.scalarSet("deletion.request.sync_device_id", ""); + } + + _checkCurrent(topic) { + if (!this.current) { + // This is only `info` because it happens when we do a tabs "quick-write" + log.info( + `Observed notification ${topic} but no current sync is being recorded.` + ); + return false; + } + return true; + } + + _shouldSubmitForDataChange() { + let newID = lazy.fxAccounts.telemetry.getSanitizedUID() || EMPTY_UID; + let oldID = this.lastUID; + if ( + newID != EMPTY_UID && + oldID != EMPTY_UID && + // Both are "real" uids, so we care if they've changed. + newID != oldID + ) { + log.trace( + `shouldSubmitForDataChange - uid from '${oldID}' -> '${newID}'` + ); + return true; + } + // We've gone from knowing one of the ids to not knowing it (which we + // ignore) or we've gone from not knowing it to knowing it (which is fine), + // Now check the node type because a change there also means we should + // submit. + if ( + this.lastSyncNodeType && + this.currentSyncNodeType != this.lastSyncNodeType + ) { + log.trace( + `shouldSubmitForDataChange - nodeType from '${this.lastSyncNodeType}' -> '${this.currentSyncNodeType}'` + ); + return true; + } + log.trace("shouldSubmitForDataChange - no need to submit"); + return false; + } + + maybeSubmitForDataChange() { + if (this._shouldSubmitForDataChange()) { + log.info( + "Early submission of sync telemetry due to changed IDs/NodeType" + ); + this.finish("idchange"); // this actually submits. + this.lastSubmissionTime = Services.telemetry.msSinceProcessStart(); + } + + // Only update the last UIDs if we actually know them. + let current_uid = lazy.fxAccounts.telemetry.getSanitizedUID(); + if (current_uid) { + this.lastUID = current_uid; + } + if (this.currentSyncNodeType) { + this.lastSyncNodeType = this.currentSyncNodeType; + } + } + + maybeSubmitForInterval() { + // We want to submit the ping every `this.submissionInterval` but only when + // there's no current sync in progress, otherwise we may end up submitting + // the sync and the events caused by it in different pings. + if ( + this.current == null && + Services.telemetry.msSinceProcessStart() - this.lastSubmissionTime > + this.submissionInterval + ) { + this.finish("schedule"); + this.lastSubmissionTime = Services.telemetry.msSinceProcessStart(); + } + } + + onSyncFinished(error) { + if (!this.current) { + log.warn("onSyncFinished but we aren't recording"); + return; + } + this.current.finished(error); + this.currentSyncNodeType = this.current.syncNodeType; + let current = this.current; + this.current = null; + this.takeTelemetryRecord(current); + } + + takeTelemetryRecord(record) { + // We check for "data change" before appending the current sync to payloads, + // as it is the current sync which has the data with the new data, and thus + // must go in the *next* submission. + this.maybeSubmitForDataChange(); + if (this.payloads.length < this.maxPayloadCount) { + this.payloads.push(record.toJSON()); + } else { + ++this.discarded; + } + // If we are submitting due to timing, it's desirable that the most recent + // sync is included, so we check after appending the record. + this.maybeSubmitForInterval(); + } + + _addHistogram(hist) { + let histogram = Services.telemetry.getHistogramById(hist); + let s = histogram.snapshot(); + this.histograms[hist] = s; + } + + _recordEvent(eventDetails) { + this.maybeSubmitForDataChange(); + + if (this.events.length >= this.maxEventsCount) { + log.warn("discarding event - already queued our maximum", eventDetails); + return; + } + + let { object, method, value, extra } = eventDetails; + if (extra) { + extra = normalizeExtraTelemetryFields(extra); + eventDetails = { object, method, value, extra }; + } + + if (!validateTelemetryEvent(eventDetails)) { + // we've already logged what the problem is... + return; + } + log.debug("recording event", eventDetails); + + if (extra && lazy.Resource.serverTime && !extra.serverTime) { + extra.serverTime = String(lazy.Resource.serverTime); + } + let category = "sync"; + let ts = Math.floor(tryGetMonotonicTimestamp()); + + // An event record is a simple array with at least 4 items. + let event = [ts, category, method, object]; + // It may have up to 6 elements if |extra| is defined + if (value) { + event.push(value); + if (extra) { + event.push(extra); + } + } else if (extra) { + event.push(null); // a null for the empty value. + event.push(extra); + } + this.events.push(event); + this.maybeSubmitForInterval(); + } + + observe(subject, topic, data) { + log.trace(`observed ${topic} ${data}`); + + switch (topic) { + case "weave:service:ready": + case "weave:service:login:got-hashed-id": + case "fxaccounts:new_device_id": + this.onAccountInitOrChange(); + break; + + case "fxaccounts:onlogout": + this.onAccountLogout(); + break; + + /* sync itself state changes */ + case "weave:service:sync:start": + this.onSyncStarted(data); + break; + + case "weave:service:sync:finish": + if (this._checkCurrent(topic)) { + this.onSyncFinished(null); + } + break; + + case "weave:service:sync:error": + // argument needs to be truthy (this should always be the case) + this.onSyncFinished(subject || "Unknown"); + break; + + /* engine sync state changes */ + case "weave:engine:sync:start": + if (this._checkCurrent(topic)) { + this.current.onEngineStart(data); + } + break; + case "weave:engine:sync:finish": + if (this._checkCurrent(topic)) { + this.current.onEngineStop(data, null); + } + break; + + case "weave:engine:sync:error": + if (this._checkCurrent(topic)) { + // argument needs to be truthy (this should always be the case) + this.current.onEngineStop(data, subject || "Unknown"); + } + break; + + /* engine counts */ + case "weave:engine:sync:applied": + if (this._checkCurrent(topic)) { + this.current.onEngineApplied(data, subject); + } + break; + + case "weave:engine:sync:step": + if (this._checkCurrent(topic)) { + this.current.onEngineStep(data, subject); + } + break; + + case "weave:engine:sync:uploaded": + if (this._checkCurrent(topic)) { + this.current.onEngineUploaded(data, subject); + } + break; + + case "weave:engine:validate:finish": + if (this._checkCurrent(topic)) { + this.current.onEngineValidated(data, subject); + } + break; + + case "weave:engine:validate:error": + if (this._checkCurrent(topic)) { + this.current.onEngineValidateError(data, subject || "Unknown"); + } + break; + + case "weave:telemetry:event": + case "fxa:telemetry:event": + this._recordEvent(subject); + break; + + case "weave:telemetry:histogram": + this._addHistogram(data); + break; + + case "weave:telemetry:migration": + this._addMigrationRecord(data, subject); + break; + + default: + log.warn(`unexpected observer topic ${topic}`); + break; + } + } + + // Transform an exception into a standard description. Exposed here for when + // this module isn't directly responsible for knowing the transform should + // happen (for example, when including an error in the |extra| field of + // event telemetry) + transformError(error) { + // Certain parts of sync will use this pattern as a way to communicate to + // processIncoming to abort the processing. However, there's no guarantee + // this can only happen then. + if (typeof error == "object" && error.code && error.cause) { + error = error.cause; + } + if (lazy.Async.isShutdownException(error)) { + return { name: "shutdownerror" }; + } + + if (typeof error === "string") { + if (error.startsWith("error.")) { + // This is hacky, but I can't imagine that it's not also accurate. + return { name: "othererror", error }; + } + error = ErrorSanitizer.cleanErrorMessage(error); + return { name: "unexpectederror", error }; + } + + if (error instanceof lazy.AuthenticationError) { + return { name: "autherror", from: error.source }; + } + + if (DOMException.isInstance(error)) { + return { + name: "unexpectederror", + error: ErrorSanitizer.cleanErrorMessage(error.message, error), + }; + } + + let httpCode = + error.status || (error.response && error.response.status) || error.code; + + if (httpCode) { + return { name: "httperror", code: httpCode }; + } + + if (error.failureCode) { + return { name: "othererror", error: error.failureCode }; + } + + if (error.result) { + // many "nsresult" errors are actually network errors - if they are + // associated with the "network" module we assume that's true. + // We also assume NS_ERROR_ABORT is such an error - for almost everything + // we care about, it acually is (eg, if the connection fails early enough + // or if we have a captive portal etc) - we don't lose anything by this + // assumption, it's just that the error will no longer be in the "nserror" + // category, so our analysis can still find them. + if ( + error.result == Cr.NS_ERROR_ABORT || + NS_ERROR_GET_MODULE(error.result) == NS_ERROR_MODULE_NETWORK + ) { + return { name: "httperror", code: error.result }; + } + return { name: "nserror", code: error.result }; + } + // It's probably an Error object, but it also could be some + // other object that may or may not override toString to do + // something useful. + let msg = String(error); + if (msg.startsWith("[object")) { + // Nothing useful in the default, check for a string "message" property. + if (typeof error.message == "string") { + msg = String(error.message); + } else { + // Hopefully it won't come to this... + msg = JSON.stringify(error); + } + } + return { + name: "unexpectederror", + error: ErrorSanitizer.cleanErrorMessage(msg), + }; + } +} + +export var SyncTelemetry = new SyncTelemetryImpl(ENGINES); |