diff options
Diffstat (limited to 'toolkit/components/crashes/CrashManager.in.sys.mjs')
-rw-r--r-- | toolkit/components/crashes/CrashManager.in.sys.mjs | 1616 |
1 files changed, 1616 insertions, 0 deletions
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

import { AppConstants } from "resource://gre/modules/AppConstants.sys.mjs";
import { setTimeout } from "resource://gre/modules/Timer.sys.mjs";

const lazy = {};

ChromeUtils.defineESModuleGetters(lazy, {
  Log: "resource://gre/modules/Log.sys.mjs",
  TelemetryController: "resource://gre/modules/TelemetryController.sys.mjs",
});

/**
 * How long to wait after application startup before crash event files are
 * automatically aggregated.
 *
 * We defer aggregation for performance reasons, as we don't want too many
 * services competing for I/O immediately after startup.
 */
const AGGREGATE_STARTUP_DELAY_MS = 57000;

const MILLISECONDS_IN_DAY = 24 * 60 * 60 * 1000;

/**
 * Convert a Date into the number of whole days elapsed since the UNIX epoch.
 *
 * This was copied from /services/metrics.storage.jsm. The implementation
 * does not account for leap seconds.
 *
 * @param date {Date} The date to convert.
 *
 * @returns {Number} floor(milliseconds since epoch / milliseconds per day)
 */
export function dateToDays(date) {
  const millisSinceEpoch = date.getTime();
  return Math.floor(millisSinceEpoch / MILLISECONDS_IN_DAY);
}

/**
 * Take a field out of the specified object: read its value, then delete it.
 *
 * The value is handed back exactly as stored; no parsing or conversion is
 * performed.
 *
 * @param obj {Object} The object holding the field
 * @param field {String} The name of the field to be read and removed
 *
 * @returns the field's value, or null if the object has no such field
 */
function getAndRemoveField(obj, field) {
  if (!(field in obj)) {
    return null;
  }

  const value = obj[field];
  delete obj[field];

  return value;
}
+ * + * @param obj {Object} The object holding the field + * @param field {String} The name of the field to be parsed and removed + * + * @returns {Object} the parsed object, null if none was found + */ +function parseAndRemoveField(obj, field) { + let value = null; + + if (field in obj) { + try { + value = JSON.parse(obj[field]); + } catch (e) { + console.error(e); + } + + delete obj[field]; + } + + return value; +} + +/** + * A gateway to crash-related data. + * + * This type is generic and can be instantiated any number of times. + * However, most applications will typically only have one instance + * instantiated and that instance will point to profile and user appdata + * directories. + * + * Instances are created by passing an object with properties. + * Recognized properties are: + * + * pendingDumpsDir (string) (required) + * Where dump files that haven't been uploaded are located. + * + * submittedDumpsDir (string) (required) + * Where records of uploaded dumps are located. + * + * eventsDirs (array) + * Directories (defined as strings) where events files are written. This + * instance will collects events from files in the directories specified. + * + * storeDir (string) + * Directory we will use for our data store. This instance will write + * data files into the directory specified. + * + * telemetryStoreSizeKey (string) + * Telemetry histogram to report store size under. + */ +export var CrashManager = function (options) { + for (let k in options) { + let value = options[k]; + + switch (k) { + case "pendingDumpsDir": + case "submittedDumpsDir": + case "eventsDirs": + case "storeDir": + let key = "_" + k; + delete this[key]; + Object.defineProperty(this, key, { value }); + break; + case "telemetryStoreSizeKey": + this._telemetryStoreSizeKey = value; + break; + + default: + throw new Error("Unknown property in options: " + k); + } + } + + // Promise for in-progress aggregation operation. 
We store it on the + // object so it can be returned for in-progress operations. + this._aggregatePromise = null; + + // Map of crash ID / promise tuples used to track adding new crashes. + this._crashPromises = new Map(); + + // Promise for the crash ping used only for testing. + this._pingPromise = null; + + // The CrashStore currently attached to this object. + this._store = null; + + // A Task to retrieve the store. This is needed to avoid races when + // _getStore() is called multiple times in a short interval. + this._getStoreTask = null; + + // The timer controlling the expiration of the CrashStore instance. + this._storeTimer = null; + + // This is a semaphore that prevents the store from being freed by our + // timer-based resource freeing mechanism. + this._storeProtectedCount = 0; +}; + +CrashManager.prototype = Object.freeze({ + // gen_CrashManager.py will input the proper process map informations. + /* SUBST: CRASH_MANAGER_PROCESS_MAP */ + + // A real crash. + CRASH_TYPE_CRASH: "crash", + + // A hang. + CRASH_TYPE_HANG: "hang", + + // Submission result values. + SUBMISSION_RESULT_OK: "ok", + SUBMISSION_RESULT_FAILED: "failed", + + DUMP_REGEX: + /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.dmp$/i, + SUBMITTED_REGEX: + /^bp-(?:hr-)?([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.txt$/i, + ALL_REGEX: /^(.*)$/, + + // How long the store object should persist in memory before being + // automatically garbage collected. + STORE_EXPIRATION_MS: 60 * 1000, + + // Number of days after which a crash with no activity will get purged. + PURGE_OLDER_THAN_DAYS: 180, + + // The following are return codes for individual event file processing. + // File processed OK. + EVENT_FILE_SUCCESS: "ok", + // The event appears to be malformed. + EVENT_FILE_ERROR_MALFORMED: "malformed", + // The event is obsolete. + EVENT_FILE_ERROR_OBSOLETE: "obsolete", + // The type of event is unknown. 
+ EVENT_FILE_ERROR_UNKNOWN_EVENT: "unknown-event", + + _lazyGetDir(field, path, leaf) { + delete this[field]; + let value = PathUtils.join(path, leaf); + Object.defineProperty(this, field, { value }); + return value; + }, + + get _crDir() { + return this._lazyGetDir( + "_crDir", + Services.dirsvc.get("UAppData", Ci.nsIFile).path, + "Crash Reports" + ); + }, + + get _storeDir() { + return this._lazyGetDir( + "_storeDir", + Services.dirsvc.get("ProfD", Ci.nsIFile).path, + "crashes" + ); + }, + + get _pendingDumpsDir() { + return this._lazyGetDir("_pendingDumpsDir", this._crDir, "pending"); + }, + + get _submittedDumpsDir() { + return this._lazyGetDir("_submittedDumpsDir", this._crDir, "submitted"); + }, + + get _eventsDirs() { + delete this._eventsDirs; + let value = [ + PathUtils.join(this._crDir, "events"), + PathUtils.join(this._storeDir, "events"), + ]; + Object.defineProperty(this, "_eventsDirs", { value }); + return value; + }, + + /** + * Obtain a list of all dumps pending upload. + * + * The returned value is a promise that resolves to an array of objects + * on success. Each element in the array has the following properties: + * + * id (string) + * The ID of the crash (a UUID). + * + * path (string) + * The filename of the crash (<UUID.dmp>) + * + * date (Date) + * When this dump was created + * + * The returned arry is sorted by the modified time of the file backing + * the entry, oldest to newest. + * + * @return Promise<Array> + */ + pendingDumps() { + return this._getDirectoryEntries(this._pendingDumpsDir, this.DUMP_REGEX); + }, + + /** + * Obtain a list of all dump files corresponding to submitted crashes. + * + * The returned value is a promise that resolves to an Array of + * objects. Each object has the following properties: + * + * path (string) + * The path of the file this entry comes from. + * + * id (string) + * The crash UUID. + * + * date (Date) + * The (estimated) date this crash was submitted. 
+ * + * The returned array is sorted by the modified time of the file backing + * the entry, oldest to newest. + * + * @return Promise<Array> + */ + submittedDumps() { + return this._getDirectoryEntries( + this._submittedDumpsDir, + this.SUBMITTED_REGEX + ); + }, + + /** + * Aggregates "loose" events files into the unified "database." + * + * This function should be called periodically to collect metadata from + * all events files into the central data store maintained by this manager. + * + * Once events have been stored in the backing store the corresponding + * source files are deleted. + * + * Only one aggregation operation is allowed to occur at a time. If this + * is called when an existing aggregation is in progress, the promise for + * the original call will be returned. + * + * @return promise<int> The number of event files that were examined. + */ + aggregateEventsFiles() { + if (this._aggregatePromise) { + return this._aggregatePromise; + } + + return (this._aggregatePromise = (async () => { + if (this._aggregatePromise) { + return this._aggregatePromise; + } + + try { + let unprocessedFiles = await this._getUnprocessedEventsFiles(); + + let deletePaths = []; + let needsSave = false; + + this._storeProtectedCount++; + for (let entry of unprocessedFiles) { + try { + let result = await this._processEventFile(entry); + + switch (result) { + case this.EVENT_FILE_SUCCESS: + needsSave = true; + // Fall through. + + case this.EVENT_FILE_ERROR_MALFORMED: + case this.EVENT_FILE_ERROR_OBSOLETE: + deletePaths.push(entry.path); + break; + + case this.EVENT_FILE_ERROR_UNKNOWN_EVENT: + break; + + default: + console.error( + "Unhandled crash event file return code. Please " + + "file a bug: ", + result + ); + } + } catch (ex) { + if (DOMException.isInstance(ex)) { + this._log.warn("I/O error reading " + entry.path, ex); + } else { + // We should never encounter an exception. 
This likely represents + // a coding error because all errors should be detected and + // converted to return codes. + // + // If we get here, report the error and delete the source file + // so we don't see it again. + console.error( + "Exception when processing crash event file: " + + lazy.Log.exceptionStr(ex) + ); + deletePaths.push(entry.path); + } + } + } + + if (needsSave) { + let store = await this._getStore(); + await store.save(); + } + + for (let path of deletePaths) { + try { + await IOUtils.remove(path); + } catch (ex) { + this._log.warn("Error removing event file (" + path + ")", ex); + } + } + + return unprocessedFiles.length; + } finally { + this._aggregatePromise = false; + this._storeProtectedCount--; + } + })()); + }, + + /** + * Prune old crash data. + * + * @param date + * (Date) The cutoff point for pruning. Crashes without data newer + * than this will be pruned. + */ + pruneOldCrashes(date) { + return (async () => { + let store = await this._getStore(); + store.pruneOldCrashes(date); + await store.save(); + })(); + }, + + /** + * Run tasks that should be periodically performed. + */ + runMaintenanceTasks() { + return (async () => { + await this.aggregateEventsFiles(); + + let offset = this.PURGE_OLDER_THAN_DAYS * MILLISECONDS_IN_DAY; + await this.pruneOldCrashes(new Date(Date.now() - offset)); + })(); + }, + + /** + * Schedule maintenance tasks for some point in the future. + * + * @param delay + * (integer) Delay in milliseconds when maintenance should occur. + */ + scheduleMaintenance(delay) { + let deferred = Promise.withResolvers(); + + setTimeout(() => { + this.runMaintenanceTasks().then(deferred.resolve, deferred.reject); + }, delay); + + return deferred.promise; + }, + + /** + * Record the occurrence of a crash. + * + * This method skips event files altogether and writes directly and + * immediately to the manager's data store. + * + * @param processType (string) One of the PROCESS_TYPE constants. 
+ * @param crashType (string) One of the CRASH_TYPE constants. + * @param id (string) Crash ID. Likely a UUID. + * @param date (Date) When the crash occurred. + * @param metadata (dictionary) Crash metadata, may be empty. + * + * @return promise<null> Resolved when the store has been saved. + */ + addCrash(processType, crashType, id, date, metadata) { + let promise = (async () => { + if (!this.isValidProcessType(processType)) { + console.error( + "Unhandled process type. Please file a bug: '", + processType, + "'. Ignore in the context of " + + "test_crash_manager.js:test_addCrashWrong()." + ); + return; + } + + let store = await this._getStore(); + if (store.addCrash(processType, crashType, id, date, metadata)) { + await store.save(); + } + + let deferred = this._crashPromises.get(id); + + if (deferred) { + this._crashPromises.delete(id); + deferred.resolve(); + } + + if (this.isPingAllowed(processType)) { + this._sendCrashPing("crash", id, processType, date, metadata); + } + })(); + + return promise; + }, + + /** + * Check that the processType parameter is a valid one: + * - it is a string + * - it is listed in this.processTypes + * + * @param processType (string) Process type to evaluate + * + * @return boolean True or false depending whether it is a legit one + */ + isValidProcessType(processType) { + if (typeof processType !== "string") { + return false; + } + + for (const pt of Object.values(this.processTypes)) { + if (pt === processType) { + return true; + } + } + + return false; + }, + + /** + * Check that processType is allowed to send a ping + * + * @param processType (string) Process type to check for + * + * @return boolean True or False depending on whether ping is allowed + **/ + isPingAllowed(processType) { + // gen_CrashManager.py will input the proper process pings informations. 
+ + let processPings = { + /* SUBST: CRASH_MANAGER_PROCESS_PINGS */ + }; + + // Should not even reach this because of isValidProcessType() but just in + // case we try to be cautious + if (!(processType in processPings)) { + return false; + } + + return processPings[processType]; + }, + + /** + * Returns a promise that is resolved only the crash with the specified id + * has been fully recorded. + * + * @param id (string) Crash ID. Likely a UUID. + * + * @return promise<null> Resolved when the crash is present. + */ + async ensureCrashIsPresent(id) { + let store = await this._getStore(); + let crash = store.getCrash(id); + + if (crash) { + return Promise.resolve(); + } + + let deferred = Promise.withResolvers(); + + this._crashPromises.set(id, deferred); + return deferred.promise; + }, + + /** + * Record the remote ID for a crash. + * + * @param crashID (string) Crash ID. Likely a UUID. + * @param remoteID (Date) Server/Breakpad ID. + * + * @return boolean True if the remote ID was recorded. + */ + async setRemoteCrashID(crashID, remoteID) { + let store = await this._getStore(); + if (store.setRemoteCrashID(crashID, remoteID)) { + await store.save(); + } + }, + + /** + * Generate a submission ID for use with addSubmission{Attempt,Result}. + */ + generateSubmissionID() { + return "sub-" + Services.uuid.generateUUID().toString().slice(1, -1); + }, + + /** + * Record the occurrence of a submission attempt for a crash. + * + * @param crashID (string) Crash ID. Likely a UUID. + * @param submissionID (string) Submission ID. Likely a UUID. + * @param date (Date) When the attempt occurred. + * + * @return boolean True if the attempt was recorded and false if not. + */ + async addSubmissionAttempt(crashID, submissionID, date) { + let store = await this._getStore(); + if (store.addSubmissionAttempt(crashID, submissionID, date)) { + await store.save(); + } + }, + + /** + * Record the occurrence of a submission result for a crash. + * + * @param crashID (string) Crash ID. 
Likely a UUID. + * @param submissionID (string) Submission ID. Likely a UUID. + * @param date (Date) When the submission result was obtained. + * @param result (string) One of the SUBMISSION_RESULT constants. + * + * @return boolean True if the result was recorded and false if not. + */ + async addSubmissionResult(crashID, submissionID, date, result) { + let store = await this._getStore(); + if (store.addSubmissionResult(crashID, submissionID, date, result)) { + await store.save(); + } + }, + + /** + * Set the classification of a crash. + * + * @param crashID (string) Crash ID. Likely a UUID. + * @param classifications (array) Crash classifications. + * + * @return boolean True if the data was recorded and false if not. + */ + async setCrashClassifications(crashID, classifications) { + let store = await this._getStore(); + if (store.setCrashClassifications(crashID, classifications)) { + await store.save(); + } + }, + + /** + * Obtain the paths of all unprocessed events files. + * + * The promise-resolved array is sorted by file mtime, oldest to newest. + */ + _getUnprocessedEventsFiles() { + return (async () => { + try { + let entries = []; + + for (let dir of this._eventsDirs) { + for (let e of await this._getDirectoryEntries(dir, this.ALL_REGEX)) { + entries.push(e); + } + } + + entries.sort((a, b) => { + return a.date - b.date; + }); + + return entries; + } catch (e) { + console.error(e); + return []; + } + })(); + }, + + // See docs/crash-events.rst for the file format specification. 
+ _processEventFile(entry) { + return (async () => { + let data = await IOUtils.read(entry.path); + let store = await this._getStore(); + + let decoder = new TextDecoder(); + data = decoder.decode(data); + + let type, time; + let start = 0; + for (let i = 0; i < 2; i++) { + let index = data.indexOf("\n", start); + if (index == -1) { + return this.EVENT_FILE_ERROR_MALFORMED; + } + + let sub = data.substring(start, index); + switch (i) { + case 0: + type = sub; + break; + case 1: + time = sub; + try { + time = parseInt(time, 10); + } catch (ex) { + return this.EVENT_FILE_ERROR_MALFORMED; + } + } + + start = index + 1; + } + let date = new Date(time * 1000); + let payload = data.substring(start); + + return this._handleEventFilePayload(store, entry, type, date, payload); + })(); + }, + + _filterAnnotations(annotations) { + let filteredAnnotations = {}; + + for (let line in annotations) { + try { + if (Services.appinfo.isAnnotationAllowedForPing(line)) { + filteredAnnotations[line] = annotations[line]; + } + } catch (e) { + // Silently drop unknown annotations + } + } + + return filteredAnnotations; + }, + + /** + * Submit a Glean crash ping with the given parameters. + * + * @param {string} reason - the reason for the crash ping, one of: "crash", "event_found" + * @param {string} type - the process type (from {@link processTypes}) + * @param {DateTime} date - the time of the crash (or the closest time after it) + * @param {object} metadata - the object of Telemetry crash metadata + */ + _submitGleanCrashPing(reason, type, date, metadata) { + if ("UptimeTS" in metadata) { + Glean.crash.uptime.setRaw(parseFloat(metadata.UptimeTS) * 1e3); + } + Glean.crash.processType.set(type); + Glean.crash.time.set(date.getTime() * 1000); + Glean.crash.startup.set( + "StartupCrash" in metadata && parseInt(metadata.StartupCrash) === 1 + ); + GleanPings.crash.submit(reason); + }, + + /** + * Send a crash ping. 
+ * + * @param {string} reason - the reason for the crash ping, one of: "crash", "event_found" + * @param {string} crashId - the crash identifier + * @param {string} type - the process type (from {@link processTypes}) + * @param {DateTime} date - the time of the crash (or the closest time after it) + * @param {object} metadata - Telemetry crash metadata + */ + _sendCrashPing(reason, crashId, type, date, metadata = {}) { + // If we have a saved environment, use it. Otherwise report + // the current environment. + let reportMeta = Cu.cloneInto(metadata, {}); + let crashEnvironment = parseAndRemoveField( + reportMeta, + "TelemetryEnvironment" + ); + let sessionId = getAndRemoveField(reportMeta, "TelemetrySessionId"); + let stackTraces = getAndRemoveField(reportMeta, "StackTraces"); + let minidumpSha256Hash = getAndRemoveField( + reportMeta, + "MinidumpSha256Hash" + ); + // If CrashPingUUID is present then a Telemetry ping was generated by the + // crashreporter for this crash so we only need to send the Glean ping. + let onlyGlean = getAndRemoveField(reportMeta, "CrashPingUUID"); + + // Filter the remaining annotations to remove privacy-sensitive ones + reportMeta = this._filterAnnotations(reportMeta); + + // Glean crash pings should not be sent on Android: they are handled + // separately in lib-crash for Fenix (and potentially other GeckoView + // users). 
+ if (AppConstants.platform !== "android") { + this._submitGleanCrashPing(reason, type, date, reportMeta); + } + + if (onlyGlean) { + return; + } + + this._pingPromise = lazy.TelemetryController.submitExternalPing( + "crash", + { + version: 1, + crashDate: date.toISOString().slice(0, 10), // YYYY-MM-DD + crashTime: date.toISOString().slice(0, 13) + ":00:00.000Z", // per-hour resolution + sessionId, + crashId, + minidumpSha256Hash, + processType: type, + stackTraces, + metadata: reportMeta, + hasCrashEnvironment: crashEnvironment !== null, + }, + { + addClientId: true, + addEnvironment: true, + overrideEnvironment: crashEnvironment, + } + ); + }, + + _handleEventFilePayload(store, entry, type, date, payload) { + // The payload types and formats are documented in docs/crash-events.rst. + // Do not change the format of an existing type. Instead, invent a new + // type. + // DO NOT ADD NEW TYPES WITHOUT DOCUMENTING! + let lines = payload.split("\n"); + + switch (type) { + case "crash.main.1": + case "crash.main.2": + return this.EVENT_FILE_ERROR_OBSOLETE; + + case "crash.main.3": + let crashID = lines[0]; + let metadata = JSON.parse(lines[1]); + store.addCrash( + this.processTypes[Ci.nsIXULRuntime.PROCESS_TYPE_DEFAULT], + this.CRASH_TYPE_CRASH, + crashID, + date, + metadata + ); + + this._sendCrashPing( + "event_found", + crashID, + this.processTypes[Ci.nsIXULRuntime.PROCESS_TYPE_DEFAULT], + date, + metadata + ); + + break; + + case "crash.submission.1": + if (lines.length == 3) { + let [crashID, result, remoteID] = lines; + store.addCrash( + this.processTypes[Ci.nsIXULRuntime.PROCESS_TYPE_DEFAULT], + this.CRASH_TYPE_CRASH, + crashID, + date + ); + + let submissionID = this.generateSubmissionID(); + let succeeded = result === "true"; + store.addSubmissionAttempt(crashID, submissionID, date); + store.addSubmissionResult( + crashID, + submissionID, + date, + succeeded + ? 
this.SUBMISSION_RESULT_OK + : this.SUBMISSION_RESULT_FAILED + ); + if (succeeded) { + store.setRemoteCrashID(crashID, remoteID); + } + } else { + return this.EVENT_FILE_ERROR_MALFORMED; + } + break; + + default: + return this.EVENT_FILE_ERROR_UNKNOWN_EVENT; + } + + return this.EVENT_FILE_SUCCESS; + }, + + /** + * The resolved promise is an array of objects with the properties: + * + * path -- String filename + * id -- regexp.match()[1] (likely the crash ID) + * date -- Date mtime of the file + */ + _getDirectoryEntries(path, re) { + return (async function () { + let children = await IOUtils.getChildren(path); + let entries = []; + + for (const entry of children) { + let stat = await IOUtils.stat(entry); + if (stat.type == "directory") { + continue; + } + + let filename = PathUtils.filename(entry); + let match = re.exec(filename); + if (!match) { + continue; + } + entries.push({ + path: entry, + id: match[1], + date: stat.lastModified, + }); + } + + entries.sort((a, b) => { + return a.date - b.date; + }); + + return entries; + })(); + }, + + _getStore() { + if (this._getStoreTask) { + return this._getStoreTask; + } + + return (this._getStoreTask = (async () => { + try { + if (!this._store) { + await IOUtils.makeDirectory(this._storeDir, { + permissions: 0o700, + }); + + let store = new CrashStore( + this._storeDir, + this._telemetryStoreSizeKey + ); + await store.load(); + + this._store = store; + this._storeTimer = Cc["@mozilla.org/timer;1"].createInstance( + Ci.nsITimer + ); + } + + // The application can go long periods without interacting with the + // store. Since the store takes up resources, we automatically "free" + // the store after inactivity so resources can be returned to the + // system. We do this via a timer and a mechanism that tracks when the + // store is being accessed. + this._storeTimer.cancel(); + + // This callback frees resources from the store unless the store + // is protected from freeing by some other process. 
+ let timerCB = () => { + if (this._storeProtectedCount) { + this._storeTimer.initWithCallback( + timerCB, + this.STORE_EXPIRATION_MS, + this._storeTimer.TYPE_ONE_SHOT + ); + return; + } + + // We kill the reference that we hold. GC will kill it later. If + // someone else holds a reference, that will prevent GC until that + // reference is gone. + this._store = null; + this._storeTimer = null; + }; + + this._storeTimer.initWithCallback( + timerCB, + this.STORE_EXPIRATION_MS, + this._storeTimer.TYPE_ONE_SHOT + ); + + return this._store; + } finally { + this._getStoreTask = null; + } + })()); + }, + + /** + * Obtain information about all known crashes. + * + * Returns an array of CrashRecord instances. Instances are read-only. + */ + getCrashes() { + return (async () => { + let store = await this._getStore(); + + return store.crashes; + })(); + }, + + getCrashCountsByDay() { + return (async () => { + let store = await this._getStore(); + + return store._countsByDay; + })(); + }, +}); + +var gCrashManager; + +/** + * Interface to storage of crash data. + * + * This type handles storage of crash metadata. It exists as a separate type + * from the crash manager for performance reasons: since all crash metadata + * needs to be loaded into memory for access, we wish to easily dispose of all + * associated memory when this data is no longer needed. Having an isolated + * object whose references can easily be lost faciliates that simple disposal. + * + * When metadata is updated, the caller must explicitly persist the changes + * to disk. This prevents excessive I/O during updates. + * + * The store has a mechanism for ensuring it doesn't grow too large. A ceiling + * is placed on the number of daily events that can occur for events that can + * occur with relatively high frequency. If we've reached + * the high water mark and new data arrives, it's silently dropped. + * However, the count of actual events is always preserved. 
This allows + * us to report on the severity of problems beyond the storage threshold. + * + * Main process crashes are excluded from limits because they are both + * important and should be rare. + * + * @param storeDir (string) + * Directory the store should be located in. + * @param telemetrySizeKey (string) + * The telemetry histogram that should be used to store the size + * of the data file. + */ +export function CrashStore(storeDir, telemetrySizeKey) { + this._storeDir = storeDir; + this._telemetrySizeKey = telemetrySizeKey; + + this._storePath = PathUtils.join(storeDir, "store.json.mozlz4"); + + // Holds the read data from disk. + this._data = null; + + // Maps days since UNIX epoch to a Map of event types to counts. + // This data structure is populated when the JSON file is loaded + // and is also updated when new events are added. + this._countsByDay = new Map(); +} + +CrashStore.prototype = Object.freeze({ + // Maximum number of events to store per day. This establishes a + // ceiling on the per-type/per-day records that will be stored. + HIGH_WATER_DAILY_THRESHOLD: 500, + + /** + * Reset all data. + */ + reset() { + this._data = { + v: 1, + crashes: new Map(), + corruptDate: null, + }; + this._countsByDay = new Map(); + }, + + /** + * Load data from disk. + * + * @return Promise + */ + load() { + return (async () => { + // Loading replaces data. + this.reset(); + + try { + let decoder = new TextDecoder(); + let data = await IOUtils.read(this._storePath, { decompress: true }); + data = JSON.parse(decoder.decode(data)); + + if (data.corruptDate) { + this._data.corruptDate = new Date(data.corruptDate); + } + + // actualCounts is used to validate that the derived counts by + // days stored in the payload matches up to actual data. + let actualCounts = new Map(); + + // In the past, submissions were stored as separate crash records + // with an id of e.g. "someID-submission". 
If we find IDs ending + // with "-submission", we will need to convert the data to be stored + // as actual submissions. + // + // The old way of storing submissions was used from FF33 - FF34. We + // drop this old data on the floor. + for (let id in data.crashes) { + if (id.endsWith("-submission")) { + continue; + } + + let crash = data.crashes[id]; + let denormalized = this._denormalize(crash); + + denormalized.submissions = new Map(); + if (crash.submissions) { + for (let submissionID in crash.submissions) { + let submission = crash.submissions[submissionID]; + denormalized.submissions.set( + submissionID, + this._denormalize(submission) + ); + } + } + + this._data.crashes.set(id, denormalized); + + let key = + dateToDays(denormalized.crashDate) + "-" + denormalized.type; + actualCounts.set(key, (actualCounts.get(key) || 0) + 1); + + // If we have an OOM size, count the crash as an OOM in addition to + // being a main process crash. + if ( + denormalized.metadata && + denormalized.metadata.OOMAllocationSize + ) { + let oomKey = key + "-oom"; + actualCounts.set(oomKey, (actualCounts.get(oomKey) || 0) + 1); + } + } + + // The validation in this loop is arguably not necessary. We perform + // it as a defense against unknown bugs. + for (let dayKey in data.countsByDay) { + let day = parseInt(dayKey, 10); + for (let type in data.countsByDay[day]) { + this._ensureCountsForDay(day); + + let count = data.countsByDay[day][type]; + let key = day + "-" + type; + + // If the payload says we have data for a given day but we + // don't, the payload is wrong. Ignore it. + if (!actualCounts.has(key)) { + continue; + } + + // If we encountered more data in the payload than what the + // data structure says, use the proper value. + count = Math.max(count, actualCounts.get(key)); + + this._countsByDay.get(day).set(type, count); + } + } + } catch (ex) { + // Missing files (first use) are allowed. 
+ if (!DOMException.isInstance(ex) || ex.name != "NotFoundError") { + // If we can't load for any reason, mark a corrupt date in the instance + // and swallow the error. + // + // The marking of a corrupted file is intentionally not persisted to + // disk yet. Instead, we wait until the next save(). This is to give + // non-permanent failures the opportunity to recover on their own. + this._data.corruptDate = new Date(); + } + } + })(); + }, + + /** + * Save data to disk. + * + * @return Promise<null> + */ + save() { + return (async () => { + if (!this._data) { + return; + } + + let normalized = { + // The version should be incremented whenever the format + // changes. + v: 1, + // Maps crash IDs to objects defining the crash. + crashes: {}, + // Maps days since UNIX epoch to objects mapping event types to + // counts. This is a mirror of this._countsByDay. e.g. + // { + // 15000: { + // "main-crash": 2, + // "plugin-crash": 1 + // } + // } + countsByDay: {}, + + // When the store was last corrupted. + corruptDate: null, + }; + + if (this._data.corruptDate) { + normalized.corruptDate = this._data.corruptDate.getTime(); + } + + for (let [id, crash] of this._data.crashes) { + let c = this._normalize(crash); + + c.submissions = {}; + for (let [submissionID, submission] of crash.submissions) { + c.submissions[submissionID] = this._normalize(submission); + } + + normalized.crashes[id] = c; + } + + for (let [day, m] of this._countsByDay) { + normalized.countsByDay[day] = {}; + for (let [type, count] of m) { + normalized.countsByDay[day][type] = count; + } + } + + let encoder = new TextEncoder(); + let data = encoder.encode(JSON.stringify(normalized)); + let size = await IOUtils.write(this._storePath, data, { + tmpPath: this._storePath + ".tmp", + compress: true, + }); + if (this._telemetrySizeKey) { + Services.telemetry.getHistogramById(this._telemetrySizeKey).add(size); + } + })(); + }, + + /** + * Normalize an object into one fit for serialization. 
+ * + * This function along with _denormalize() serve to hack around the + * default handling of Date JSON serialization because Date serialization + * is undefined by JSON. + * + * Fields ending with "Date" are assumed to contain Date instances. + * We convert these to milliseconds since epoch on output and back to + * Date on input. + */ + _normalize(o) { + let normalized = {}; + + for (let k in o) { + let v = o[k]; + if (v && k.endsWith("Date")) { + normalized[k] = v.getTime(); + } else { + normalized[k] = v; + } + } + + return normalized; + }, + + /** + * Convert a serialized object back to its native form. + */ + _denormalize(o) { + let n = {}; + + for (let k in o) { + let v = o[k]; + if (v && k.endsWith("Date")) { + n[k] = new Date(parseInt(v, 10)); + } else { + n[k] = v; + } + } + + return n; + }, + + /** + * Prune old crash data. + * + * Crashes without recent activity are pruned from the store so the + * size of the store is not unbounded. If there is activity on a crash, + * that activity will keep the crash and all its data around for longer. + * + * @param date + * (Date) The cutoff at which data will be pruned. If an entry + * doesn't have data newer than this, it will be pruned. + */ + pruneOldCrashes(date) { + for (let crash of this.crashes) { + let newest = crash.newestDate; + if (!newest || newest.getTime() < date.getTime()) { + this._data.crashes.delete(crash.id); + } + } + }, + + /** + * Date the store was last corrupted and required a reset. + * + * May be null (no corruption has ever occurred) or a Date instance. + */ + get corruptDate() { + return this._data.corruptDate; + }, + + /** + * The number of distinct crashes tracked. + */ + get crashesCount() { + return this._data.crashes.size; + }, + + /** + * All crashes tracked. + * + * This is an array of CrashRecord. 
+ */ + get crashes() { + let crashes = []; + for (let [, crash] of this._data.crashes) { + crashes.push(new CrashRecord(crash)); + } + + return crashes; + }, + + /** + * Obtain a particular crash from its ID. + * + * A CrashRecord will be returned if the crash exists. null will be returned + * if the crash is unknown. + */ + getCrash(id) { + for (let crash of this.crashes) { + if (crash.id == id) { + return crash; + } + } + + return null; + }, + + _ensureCountsForDay(day) { + if (!this._countsByDay.has(day)) { + this._countsByDay.set(day, new Map()); + } + }, + + /** + * Ensure the crash record is present in storage. + * + * Returns the crash record if we're allowed to store it or null + * if we've hit the high water mark. + * + * @param processType + * (string) One of the PROCESS_TYPE constants. + * @param crashType + * (string) One of the CRASH_TYPE constants. + * @param id + * (string) The crash ID. + * @param date + * (Date) When this crash occurred. + * @param metadata + * (dictionary) Crash metadata, may be empty. + * + * @return null | object crash record + */ + _ensureCrashRecord(processType, crashType, id, date, metadata) { + if (!id) { + // Crashes are keyed on ID, so it's not really helpful to store crashes + // without IDs. + return null; + } + + let type = processType + "-" + crashType; + + if (!this._data.crashes.has(id)) { + let day = dateToDays(date); + this._ensureCountsForDay(day); + + let count = (this._countsByDay.get(day).get(type) || 0) + 1; + this._countsByDay.get(day).set(type, count); + + if ( + count > this.HIGH_WATER_DAILY_THRESHOLD && + processType != + CrashManager.prototype.processTypes[ + Ci.nsIXULRuntime.PROCESS_TYPE_DEFAULT + ] + ) { + return null; + } + + // If we have an OOM size, count the crash as an OOM in addition to + // being a main process crash. 
+ if (metadata && metadata.OOMAllocationSize) { + let oomType = type + "-oom"; + let oomCount = (this._countsByDay.get(day).get(oomType) || 0) + 1; + this._countsByDay.get(day).set(oomType, oomCount); + } + + this._data.crashes.set(id, { + id, + remoteID: null, + type, + crashDate: date, + submissions: new Map(), + classifications: [], + metadata, + }); + } + + let crash = this._data.crashes.get(id); + crash.type = type; + crash.crashDate = date; + + return crash; + }, + + /** + * Record the occurrence of a crash. + * + * @param processType (string) One of the PROCESS_TYPE constants. + * @param crashType (string) One of the CRASH_TYPE constants. + * @param id (string) Crash ID. Likely a UUID. + * @param date (Date) When the crash occurred. + * @param metadata (dictionary) Crash metadata, may be empty. + * + * @return boolean True if the crash was recorded and false if not. + */ + addCrash(processType, crashType, id, date, metadata) { + return !!this._ensureCrashRecord( + processType, + crashType, + id, + date, + metadata + ); + }, + + /** + * @return boolean True if the remote ID was recorded and false if not. + */ + setRemoteCrashID(crashID, remoteID) { + let crash = this._data.crashes.get(crashID); + if (!crash || !remoteID) { + return false; + } + + crash.remoteID = remoteID; + return true; + }, + + /** + * @param processType (string) One of the PROCESS_TYPE constants. + * @param crashType (string) One of the CRASH_TYPE constants. + * + * @return array of crashes + */ + getCrashesOfType(processType, crashType) { + let crashes = []; + for (let crash of this.crashes) { + if (crash.isOfType(processType, crashType)) { + crashes.push(crash); + } + } + + return crashes; + }, + + /** + * Ensure the submission record is present in storage. 
+ * @returns [submission, crash] + */ + _ensureSubmissionRecord(crashID, submissionID) { + let crash = this._data.crashes.get(crashID); + if (!crash || !submissionID) { + return null; + } + + if (!crash.submissions.has(submissionID)) { + crash.submissions.set(submissionID, { + requestDate: null, + responseDate: null, + result: null, + }); + } + + return [crash.submissions.get(submissionID), crash]; + }, + + /** + * @return boolean True if the attempt was recorded. + */ + addSubmissionAttempt(crashID, submissionID, date) { + let [submission, crash] = this._ensureSubmissionRecord( + crashID, + submissionID + ); + if (!submission) { + return false; + } + + submission.requestDate = date; + Services.telemetry + .getKeyedHistogramById("PROCESS_CRASH_SUBMIT_ATTEMPT") + .add(crash.type, 1); + return true; + }, + + /** + * @return boolean True if the response was recorded. + */ + addSubmissionResult(crashID, submissionID, date, result) { + let crash = this._data.crashes.get(crashID); + if (!crash || !submissionID) { + return false; + } + let submission = crash.submissions.get(submissionID); + if (!submission) { + return false; + } + + submission.responseDate = date; + submission.result = result; + Services.telemetry + .getKeyedHistogramById("PROCESS_CRASH_SUBMIT_SUCCESS") + .add(crash.type, result == "ok"); + return true; + }, + + /** + * @return boolean True if the classifications were set. + */ + setCrashClassifications(crashID, classifications) { + let crash = this._data.crashes.get(crashID); + if (!crash) { + return false; + } + + crash.classifications = classifications; + return true; + }, +}); + +/** + * Represents an individual crash with metadata. + * + * This is a wrapper around the low-level anonymous JS objects that define + * crashes. It exposes a consistent and helpful API. + * + * Instances of this type should only be constructured inside this module, + * not externally. The constructor is not considered a public API. 
+ * + * @param o (object) + * The crash's entry from the CrashStore. + */ +function CrashRecord(o) { + this._o = o; +} + +CrashRecord.prototype = Object.freeze({ + get id() { + return this._o.id; + }, + + get remoteID() { + return this._o.remoteID; + }, + + get crashDate() { + return this._o.crashDate; + }, + + /** + * Obtain the newest date in this record. + * + * This is a convenience getter. The returned value is used to determine when + * to expire a record. + */ + get newestDate() { + // We currently only have 1 date, so this is easy. + return this._o.crashDate; + }, + + get oldestDate() { + return this._o.crashDate; + }, + + get type() { + return this._o.type; + }, + + isOfType(processType, crashType) { + return processType + "-" + crashType == this.type; + }, + + get submissions() { + return this._o.submissions; + }, + + get classifications() { + return this._o.classifications; + }, + + get metadata() { + return this._o.metadata; + }, +}); + +ChromeUtils.defineLazyGetter(CrashManager, "_log", () => + lazy.Log.repository.getLogger("Crashes.CrashManager") +); + +/** + * Obtain the global CrashManager instance used by the running application. + * + * CrashManager is likely only ever instantiated once per application lifetime. + * The main reason it's implemented as a reusable type is to facilitate testing. + */ +ChromeUtils.defineLazyGetter(CrashManager, "Singleton", function () { + if (gCrashManager) { + return gCrashManager; + } + + gCrashManager = new CrashManager({ + telemetryStoreSizeKey: "CRASH_STORE_COMPRESSED_BYTES", + }); + + // Automatically aggregate event files shortly after startup. This + // ensures it happens with some frequency. + // + // There are performance considerations here. While this is doing + // work and could negatively impact performance, the amount of work + // is kept small per run by periodically aggregating event files. + // Furthermore, well-behaving installs should not have much work + // here to do. 
If there is a lot of work, that install has bigger + // issues beyond reduced performance near startup. + gCrashManager.scheduleMaintenance(AGGREGATE_STARTUP_DELAY_MS); + + return gCrashManager; +}); + +export function getCrashManager() { + return CrashManager.Singleton; +} + +/** + * Used for tests to check the crash manager is created on profile creation. + * + * @returns {CrashManager} + */ +export function getCrashManagerNoCreate() { + return gCrashManager; +} |