diff options
Diffstat (limited to 'toolkit/components/terminator')
9 files changed, 1104 insertions, 0 deletions
diff --git a/toolkit/components/terminator/TerminatorTelemetry.sys.mjs b/toolkit/components/terminator/TerminatorTelemetry.sys.mjs new file mode 100644 index 0000000000..9903027436 --- /dev/null +++ b/toolkit/components/terminator/TerminatorTelemetry.sys.mjs @@ -0,0 +1,119 @@ +/* -*- indent-tabs-mode: nil; js-indent-level: 2 -*- */ +/* vim: set ts=2 et sw=2 tw=80 filetype=javascript: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/** + * Read the data saved by nsTerminator during shutdown and feed it to the + * relevant telemetry histograms. + */ + +const lazy = {}; + +ChromeUtils.defineESModuleGetters(lazy, { + setTimeout: "resource://gre/modules/Timer.sys.mjs", +}); + +export function nsTerminatorTelemetry() { + this._wasNotified = false; + this._deferred = Promise.withResolvers(); + + IOUtils.sendTelemetry.addBlocker( + "TerminatoryTelemetry: Waiting to submit telemetry", + this._deferred.promise, + () => ({ + wasNotified: this._wasNotified, + }) + ); +} + +var HISTOGRAMS = { + "quit-application": "SHUTDOWN_PHASE_DURATION_TICKS_QUIT_APPLICATION", + "profile-change-net-teardown": + "SHUTDOWN_PHASE_DURATION_TICKS_PROFILE_CHANGE_NET_TEARDOWN", + "profile-change-teardown": + "SHUTDOWN_PHASE_DURATION_TICKS_PROFILE_CHANGE_TEARDOWN", + "profile-before-change": + "SHUTDOWN_PHASE_DURATION_TICKS_PROFILE_BEFORE_CHANGE", + "profile-before-change-qm": + "SHUTDOWN_PHASE_DURATION_TICKS_PROFILE_BEFORE_CHANGE_QM", + "xpcom-will-shutdown": "SHUTDOWN_PHASE_DURATION_TICKS_XPCOM_WILL_SHUTDOWN", + "xpcom-shutdown": "SHUTDOWN_PHASE_DURATION_TICKS_XPCOM_SHUTDOWN", + + // The following keys appear in the JSON, but do not have associated + // histograms. + "xpcom-shutdown-threads": null, + XPCOMShutdownFinal: null, + CCPostLastCycleCollection: null, +}; + +nsTerminatorTelemetry.prototype = { + classID: Components.ID("{3f78ada1-cba2-442a-82dd-d5fb300ddea7}"), + + // nsISupports + + QueryInterface: ChromeUtils.generateQI(["nsIObserver"]), + + // nsIObserver + + observe: function DS_observe(aSubject, aTopic, aData) { + this._wasNotified = true; + + (async () => { + try { + // + // This data is hardly critical, reading it can wait for a few seconds. + // + await new Promise(resolve => lazy.setTimeout(resolve, 3000)); + + let PATH = PathUtils.join( + Services.dirsvc.get("ProfLD", Ci.nsIFile).path, + "ShutdownDuration.json" + ); + let data; + try { + data = await IOUtils.readJSON(PATH); + } catch (ex) { + if (DOMException.isInstance(ex) && ex.name == "NotFoundError") { + return; + } + // Let other errors be reported by Promise's error-reporting. + throw ex; + } + + // Clean up + await IOUtils.remove(PATH); + await IOUtils.remove(PATH + ".tmp"); + + for (let k of Object.keys(data)) { + let id = HISTOGRAMS[k]; + if (id === null) { + // No histogram associated with this entry. + continue; + } + + try { + let histogram = Services.telemetry.getHistogramById(id); + histogram.add(Number.parseInt(data[k])); + } catch (ex) { + // Make sure that the error is reported and causes test failures, + // but otherwise, ignore it. + Promise.reject(ex); + continue; + } + } + + // Inform observers that we are done. + Services.obs.notifyObservers( + null, + "shutdown-terminator-telemetry-updated" + ); + } finally { + this._deferred.resolve(); + } + })(); + }, +}; + +// Module diff --git a/toolkit/components/terminator/components.conf b/toolkit/components/terminator/components.conf new file mode 100644 index 0000000000..14c4c2c562 --- /dev/null +++ b/toolkit/components/terminator/components.conf @@ -0,0 +1,15 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +Classes = [ + { + 'cid': '{3f78ada1-cba2-442a-82dd-d5fb300ddea7}', + 'contract_ids': ['@mozilla.org/toolkit/shutdown-terminator-telemetry;1'], + 'esModule': 'resource://gre/modules/TerminatorTelemetry.sys.mjs', + 'constructor': 'nsTerminatorTelemetry', + 'categories': {'profile-after-change': 'nsTerminatorTelemetry'}, + }, +] diff --git a/toolkit/components/terminator/moz.build b/toolkit/components/terminator/moz.build new file mode 100644 index 0000000000..f869b86d95 --- /dev/null +++ b/toolkit/components/terminator/moz.build @@ -0,0 +1,32 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +with Files("**"): + BUG_COMPONENT = ("Toolkit", "Async Tooling") + +XPCSHELL_TESTS_MANIFESTS += ["tests/xpcshell/xpcshell.toml"] + +SOURCES += [ + "nsTerminator.cpp", +] + +EXPORTS += [ + "nsTerminator.h", +] + +EXTRA_COMPONENTS += [ + "terminator.manifest", +] + +EXTRA_JS_MODULES += [ + "TerminatorTelemetry.sys.mjs", +] + +XPCOM_MANIFESTS += [ + "components.conf", +] + +FINAL_LIBRARY = "xul" diff --git a/toolkit/components/terminator/nsTerminator.cpp b/toolkit/components/terminator/nsTerminator.cpp new file mode 100644 index 0000000000..3a8f9f4ba6 --- /dev/null +++ b/toolkit/components/terminator/nsTerminator.cpp @@ -0,0 +1,614 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * vim: sw=2 ts=2 et lcs=trail\:.,tab\:>~ : + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/** + * A watchdog designed to terminate shutdown if it lasts too long. + * + * This watchdog is designed as a worst-case problem container for the + * common case in which Firefox just won't shutdown. + * + * We spawn a thread during quit-application. If any of the shutdown + * steps takes more than n milliseconds (63000 by default), kill the + * process as fast as possible, without any cleanup. + */ + +#include "mozilla/ShutdownPhase.h" +#include "nsTerminator.h" + +#include "prthread.h" +#include "prmon.h" +#include "prio.h" + +#include "nsString.h" +#include "nsDirectoryServiceUtils.h" +#include "nsAppDirectoryServiceDefs.h" + +#include "nsExceptionHandler.h" +#include "GeckoProfiler.h" +#include "nsThreadUtils.h" +#include "nsXULAppAPI.h" + +#if defined(XP_WIN) +# include <windows.h> +#else +# include <unistd.h> +#endif + +#include "mozilla/AppShutdown.h" +#include "mozilla/ArrayUtils.h" +#include "mozilla/Atomics.h" +#include "mozilla/DebugOnly.h" +#include "mozilla/IntentionalCrash.h" +#include "mozilla/MemoryChecking.h" +#include "mozilla/Preferences.h" +#include "mozilla/SpinEventLoopUntil.h" +#include "mozilla/UniquePtr.h" +#include "mozilla/Unused.h" +#include "mozilla/Telemetry.h" + +#include "mozilla/dom/IOUtils.h" +#include "mozilla/dom/workerinternals/RuntimeService.h" + +// Normally, the number of milliseconds that AsyncShutdown waits until +// it decides to crash is specified as a preference. We use the +// following value as a fallback if for some reason the preference is +// absent. +#define FALLBACK_ASYNCSHUTDOWN_CRASH_AFTER_MS 60000 + +// Additional number of milliseconds to wait until we decide to exit +// forcefully. +#define ADDITIONAL_WAIT_BEFORE_CRASH_MS 3000 + +#define HEARTBEAT_INTERVAL_MS 100 + +namespace mozilla { + +namespace { + +/** + * A step during shutdown. + * + * Shutdown is divided in steps, which all map to an observer + * notification. The duration of a step is defined as the number of + * ticks between the time we receive a notification and the next one. + */ +struct ShutdownStep { + mozilla::ShutdownPhase mPhase; + Atomic<int> mTicks; + + constexpr explicit ShutdownStep(mozilla::ShutdownPhase aPhase) + : mPhase(aPhase), mTicks(-1) {} +}; + +static ShutdownStep sShutdownSteps[] = { + ShutdownStep(mozilla::ShutdownPhase::AppShutdownConfirmed), + ShutdownStep(mozilla::ShutdownPhase::AppShutdownNetTeardown), + ShutdownStep(mozilla::ShutdownPhase::AppShutdownTeardown), + ShutdownStep(mozilla::ShutdownPhase::AppShutdown), + ShutdownStep(mozilla::ShutdownPhase::AppShutdownQM), + ShutdownStep(mozilla::ShutdownPhase::XPCOMWillShutdown), + ShutdownStep(mozilla::ShutdownPhase::XPCOMShutdown), + ShutdownStep(mozilla::ShutdownPhase::XPCOMShutdownThreads), + ShutdownStep(mozilla::ShutdownPhase::XPCOMShutdownFinal), + ShutdownStep(mozilla::ShutdownPhase::CCPostLastCycleCollection), +}; + +int GetStepForPhase(mozilla::ShutdownPhase aPhase) { + for (size_t i = 0; i < std::size(sShutdownSteps); i++) { + if (sShutdownSteps[i].mPhase >= aPhase) { + return (int)i; + } + } + return -1; +} + +// Utility function: create a thread that is non-joinable, +// does not prevent the process from terminating, is never +// cooperatively scheduled, and uses a default stack size. +PRThread* CreateSystemThread(void (*start)(void* arg), void* arg) { + PRThread* thread = + PR_CreateThread(PR_SYSTEM_THREAD, /* This thread will not prevent the + process from terminating */ + start, arg, PR_PRIORITY_LOW, + PR_GLOBAL_THREAD, /* Make sure that the thread is never + cooperatively scheduled */ + PR_UNJOINABLE_THREAD, 0 /* Use default stack size */ + ); + MOZ_LSAN_INTENTIONALLY_LEAK_OBJECT( + thread); // This pointer will never be deallocated. + return thread; +} + +//////////////////////////////////////////// +// +// The watchdog +// +// This nspr thread is in charge of crashing the process if any stage of +// shutdown lasts more than some predefined duration. As a side-effect, it +// measures the duration of each stage of shutdown. +// + +// The heartbeat of the operation. +// +// Main thread: +// +// * Whenever a shutdown step has been completed, the main thread +// swaps gHeartbeat to 0 to mark that the shutdown process is still +// progressing. The value swapped away indicates the number of ticks +// it took for the shutdown step to advance. +// +// Watchdog thread: +// +// * Every tick, the watchdog thread increments gHearbeat atomically. +// +// A note about precision: +// Since gHeartbeat is generally reset to 0 between two ticks, this means +// that gHeartbeat stays at 0 less than one tick. Consequently, values +// extracted from gHeartbeat must be considered rounded up. +Atomic<uint32_t> gHeartbeat(0); + +struct Options { + /** + * How many ticks before we should crash the process. + */ + uint32_t crashAfterTicks; +}; + +/** + * Entry point for the watchdog thread + */ +void RunWatchdog(void* arg) { + NS_SetCurrentThreadName("Shutdown Hang Terminator"); + + // Let's copy and deallocate options, that's one less leak to worry + // about. + UniquePtr<Options> options((Options*)arg); + uint32_t crashAfterTicks = options->crashAfterTicks; + options = nullptr; + + const uint32_t timeToLive = crashAfterTicks; + while (true) { + // + // We do not want to sleep for the entire duration, + // as putting the computer to sleep would suddenly + // cause us to timeout on wakeup. + // + // Rather, we prefer sleeping for at most 1 second + // at a time. If the computer sleeps then wakes up, + // we have lost at most one second, which is much + // more reasonable. + // +#if defined(XP_WIN) + Sleep(HEARTBEAT_INTERVAL_MS /* ms */); +#else + usleep(HEARTBEAT_INTERVAL_MS * 1000 /* usec */); +#endif + + if (gHeartbeat++ < timeToLive) { + continue; + } + + // Arrived here we know we will crash in a way or another. + NoteIntentionalCrash(XRE_GetProcessTypeString()); + + // Until we have general log output for crash annotations in treeherder + // (bug 1728721) we manually spit out our nested event loop stack. + // XXX: Remove once bug 1728721 is fixed. + nsCString stack; + AutoNestedEventLoopAnnotation::CopyCurrentStack(stack); + printf_stderr( + "RunWatchdog: Mainthread nested event loops during hang: \n --- %s\n", + stack.get()); + + // Let's find the last known shutdown phase. + mozilla::ShutdownPhase lastPhase = mozilla::ShutdownPhase::NotInShutdown; + // Looping inverse here to make the search more robust in case + // the observer that triggers UpdateHeartbeat was not called + // at all or in the expected order on some step. This should + // give us always the last known ShutdownStep. + for (int i = ArrayLength(sShutdownSteps) - 1; i >= 0; --i) { + if (sShutdownSteps[i].mTicks > -1) { + lastPhase = sShutdownSteps[i].mPhase; + break; + } + } + + if (lastPhase == mozilla::ShutdownPhase::NotInShutdown) { + // This is not something we expect to ever happen, but still. + CrashReporter::SetMinidumpAnalysisAllThreads(); + MOZ_CRASH("Shutdown hanging before starting any known phase."); + } + + // First check if worker shutdown started and is incomplete, in case + // report running workers. + mozilla::dom::workerinternals::RuntimeService* runtimeService = + mozilla::dom::workerinternals::RuntimeService::GetService(); + if (runtimeService) { + // CrashIfHanging will check if we actually ever asked for worker + // shutdown, so calling it before is a no-op. + runtimeService->CrashIfHanging(); + } + + // Otherwise just report our shutdown phase. + // This string will be leaked. + nsCString msg; + msg.AppendPrintf( + "Shutdown hanging at step %s. " + "Something is blocking the main-thread.", + mozilla::AppShutdown::GetShutdownPhaseName(lastPhase)); + + CrashReporter::SetMinidumpAnalysisAllThreads(); + MOZ_CRASH_UNSAFE(strdup(msg.BeginReading())); + } +} + +//////////////////////////////////////////// +// +// Writer thread +// +// This nspr thread is in charge of writing to disk statistics produced by the +// watchdog thread and collected by the main thread. Note that we use a nspr +// thread rather than usual XPCOM I/O simply because we outlive XPCOM and its +// threads. +// + +// +// Communication between the main thread and the writer thread. +// +// Main thread: +// +// * Whenever a shutdown step has been completed, the main thread +// obtains the number of ticks from the watchdog threads, builds +// a string representing all the data gathered so far, places +// this string in `gWriteData`, and wakes up the writer thread +// using `gWriteReady`. If `gWriteData` already contained a non-null +// pointer, this means that the writer thread is lagging behind the +// main thread, and the main thread cleans up the memory. +// +// Writer thread: +// +// * When awake, the writer thread swaps `gWriteData` to nullptr. If +// `gWriteData` contained data to write, the . If so, the writer +// thread writes the data to a file named "ShutdownDuration.json.tmp", +// then moves that file to "ShutdownDuration.json" and cleans up the +// data. If `gWriteData` contains a nullptr, the writer goes to sleep +// until it is awkened using `gWriteReady`. +// +// +// The data written by the writer thread will be read by another +// module upon the next restart and fed to Telemetry. +// +Atomic<nsCString*> gWriteData(nullptr); +PRMonitor* gWriteReady = nullptr; + +void RunWriter(void* arg) { + AUTO_PROFILER_REGISTER_THREAD("Shutdown Statistics Writer"); + NS_SetCurrentThreadName("Shutdown Statistics Writer"); + + MOZ_LSAN_INTENTIONALLY_LEAK_OBJECT(arg); + // Shutdown will generally complete before we have a chance to + // deallocate. This is not a leak. + + // Setup destinationPath and tmpFilePath + + nsCString destinationPath; + destinationPath.Adopt(static_cast<char*>(arg)); + nsAutoCString tmpFilePath; + tmpFilePath.Append(destinationPath); + tmpFilePath.AppendLiteral(".tmp"); + + // Cleanup any file leftover from a previous run + Unused << PR_Delete(tmpFilePath.get()); + Unused << PR_Delete(destinationPath.get()); + + while (true) { + // + // Check whether we have received data from the main thread. + // + // We perform the check before waiting on `gWriteReady` as we may + // have received data while we were busy writing. + // + // Also note that gWriteData may have been modified several times + // since we last checked. That's ok, we are not losing any important + // data (since we keep adding data), and we are not leaking memory + // (since the main thread deallocates any data that hasn't been + // consumed by the writer thread). + // + UniquePtr<nsCString> data(gWriteData.exchange(nullptr)); + if (!data) { + // Data is not available yet. + // Wait until the main thread provides it. + PR_EnterMonitor(gWriteReady); + PR_Wait(gWriteReady, PR_INTERVAL_NO_TIMEOUT); + PR_ExitMonitor(gWriteReady); + continue; + } + + MOZ_LSAN_INTENTIONALLY_LEAK_OBJECT(data.get()); + // Shutdown may complete before we have a chance to deallocate. + // This is not a leak. + + // + // Write to a temporary file + // + // In case of any error, we simply give up. Since the data is + // hardly critical, we don't want to spend too much effort + // salvaging it. + // + UniquePtr<PRFileDesc, PR_CloseDelete> tmpFileDesc(PR_Open( + tmpFilePath.get(), PR_WRONLY | PR_TRUNCATE | PR_CREATE_FILE, 00600)); + + // Shutdown may complete before we have a chance to close the file. + // This is not a leak. + MOZ_LSAN_INTENTIONALLY_LEAK_OBJECT(tmpFileDesc.get()); + + if (tmpFileDesc == nullptr) { + break; + } + if (PR_Write(tmpFileDesc.get(), data->get(), data->Length()) == -1) { + break; + } + tmpFileDesc.reset(); + + // + // Rename on top of destination file. + // + // This is not sufficient to guarantee that the destination file + // will be written correctly, but, again, we don't care enough + // about the data to make more efforts. + // + Unused << PR_Delete(destinationPath.get()); + if (PR_Rename(tmpFilePath.get(), destinationPath.get()) != PR_SUCCESS) { + break; + } + } +} + +} // namespace + +NS_IMPL_ISUPPORTS(nsTerminator, nsIObserver) + +nsTerminator::nsTerminator() : mInitialized(false), mCurrentStep(-1) {} + +// Actually launch these threads. This takes place at the first sign of +// shutdown. +void nsTerminator::Start() { + MOZ_ASSERT(!mInitialized); + + StartWatchdog(); +#if !defined(NS_FREE_PERMANENT_DATA) + // Only allow nsTerminator to write on non-leak-checked builds so we don't + // get leak warnings on shutdown for intentional leaks (see bug 1242084). + // This will be enabled again by bug 1255484 when 1255478 lands. + StartWriter(); +#endif // !defined(NS_FREE_PERMANENT_DATA) + mInitialized = true; +} + +NS_IMETHODIMP +nsTerminator::Observe(nsISupports*, const char* aTopic, const char16_t*) { + // This Observe is now only used for testing purposes. + // XXX: Check if we should change our testing strategy. + if (strcmp(aTopic, "terminator-test-quit-application") == 0) { + AdvancePhase(mozilla::ShutdownPhase::AppShutdownConfirmed); + } else if (strcmp(aTopic, "terminator-test-profile-change-net-teardown") == + 0) { + AdvancePhase(mozilla::ShutdownPhase::AppShutdownNetTeardown); + } else if (strcmp(aTopic, "terminator-test-profile-change-teardown") == 0) { + AdvancePhase(mozilla::ShutdownPhase::AppShutdownTeardown); + } else if (strcmp(aTopic, "terminator-test-profile-before-change") == 0) { + AdvancePhase(mozilla::ShutdownPhase::AppShutdown); + } else if (strcmp(aTopic, "terminator-test-profile-before-change-qm") == 0) { + AdvancePhase(mozilla::ShutdownPhase::AppShutdownQM); + } else if (strcmp(aTopic, + "terminator-test-profile-before-change-telemetry") == 0) { + AdvancePhase(mozilla::ShutdownPhase::AppShutdownTelemetry); + } else if (strcmp(aTopic, "terminator-test-xpcom-will-shutdown") == 0) { + AdvancePhase(mozilla::ShutdownPhase::XPCOMWillShutdown); + } else if (strcmp(aTopic, "terminator-test-xpcom-shutdown") == 0) { + AdvancePhase(mozilla::ShutdownPhase::XPCOMShutdown); + } else if (strcmp(aTopic, "terminator-test-xpcom-shutdown-threads") == 0) { + AdvancePhase(mozilla::ShutdownPhase::XPCOMShutdownThreads); + } else if (strcmp(aTopic, "terminator-test-XPCOMShutdownFinal") == 0) { + AdvancePhase(mozilla::ShutdownPhase::XPCOMShutdownFinal); + } else if (strcmp(aTopic, "terminator-test-CCPostLastCycleCollection") == 0) { + AdvancePhase(mozilla::ShutdownPhase::CCPostLastCycleCollection); + } + + return NS_OK; +} + +// Prepare, allocate and start the watchdog thread. +// By design, it will never finish, nor be deallocated. +void nsTerminator::StartWatchdog() { + int32_t crashAfterMS = + Preferences::GetInt("toolkit.asyncshutdown.crash_timeout", + FALLBACK_ASYNCSHUTDOWN_CRASH_AFTER_MS); + // Ignore negative values + if (crashAfterMS <= 0) { + crashAfterMS = FALLBACK_ASYNCSHUTDOWN_CRASH_AFTER_MS; + } + + // Add a little padding, to ensure that we do not crash before + // AsyncShutdown. + if (crashAfterMS > INT32_MAX - ADDITIONAL_WAIT_BEFORE_CRASH_MS) { + // Defend against overflow + crashAfterMS = INT32_MAX; + } else { + crashAfterMS += ADDITIONAL_WAIT_BEFORE_CRASH_MS; + } + +#ifdef MOZ_VALGRIND + // If we're running on Valgrind, we'll be making forward progress at a + // rate of somewhere between 1/25th and 1/50th of normal. This can cause + // timeouts frequently enough to be a problem for the Valgrind runs on + // automation: see bug 1296819. As an attempt to avoid the worst of this, + // scale up the presented timeout by a factor of three. For a + // non-Valgrind-enabled build, or for an enabled build which isn't running + // on Valgrind, the timeout is unchanged. + if (RUNNING_ON_VALGRIND) { + const int32_t scaleUp = 3; + if (crashAfterMS >= (INT32_MAX / scaleUp) - 1) { + // Defend against overflow + crashAfterMS = INT32_MAX; + } else { + crashAfterMS *= scaleUp; + } + } +#endif + + UniquePtr<Options> options(new Options()); + // crashAfterTicks is guaranteed to be > 0 as + // crashAfterMS >= ADDITIONAL_WAIT_BEFORE_CRASH_MS >> HEARTBEAT_INTERVAL_MS + options->crashAfterTicks = crashAfterMS / HEARTBEAT_INTERVAL_MS; + + DebugOnly<PRThread*> watchdogThread = + CreateSystemThread(RunWatchdog, options.release()); + MOZ_ASSERT(watchdogThread); +} + +// Prepare, allocate and start the writer thread. By design, it will never +// finish, nor be deallocated. In case of error, we degrade +// gracefully to not writing Telemetry data. +void nsTerminator::StartWriter() { + if (!Telemetry::CanRecordExtended()) { + return; + } + nsCOMPtr<nsIFile> profLD; + nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_LOCAL_50_DIR, + getter_AddRefs(profLD)); + if (NS_FAILED(rv)) { + return; + } + + rv = profLD->Append(u"ShutdownDuration.json"_ns); + if (NS_FAILED(rv)) { + return; + } + + nsAutoString path; + rv = profLD->GetPath(path); + if (NS_FAILED(rv)) { + return; + } + + gWriteReady = PR_NewMonitor(); + MOZ_LSAN_INTENTIONALLY_LEAK_OBJECT( + gWriteReady); // We will never deallocate this object + PRThread* writerThread = CreateSystemThread(RunWriter, ToNewUTF8String(path)); + + if (!writerThread) { + return; + } +} + +// This helper is here to preserve the existing crash reporting behavior +// based on observer topic names, using the shutdown phase name only for +// phases without associated topic. +const char* GetReadableNameForPhase(mozilla::ShutdownPhase aPhase) { + const char* readableName = mozilla::AppShutdown::GetObserverKey(aPhase); + if (!readableName) { + readableName = mozilla::AppShutdown::GetShutdownPhaseName(aPhase); + } + return readableName; +} + +void nsTerminator::AdvancePhase(mozilla::ShutdownPhase aPhase) { + // If the phase is unknown, just ignore it. + auto step = GetStepForPhase(aPhase); + if (step < 0) { + return; + } + + // As we have seen examples in the wild of shutdown notifications + // not being sent (or not being sent in the expected order), we do + // not assume a specific order. + if (!mInitialized) { + Start(); + } + + UpdateHeartbeat(step); +#if !defined(NS_FREE_PERMANENT_DATA) + // Only allow nsTerminator to write on non-leak checked builds so we don't get + // leak warnings on shutdown for intentional leaks (see bug 1242084). This + // will be enabled again by bug 1255484 when 1255478 lands. + UpdateTelemetry(); +#endif // !defined(NS_FREE_PERMANENT_DATA) + UpdateCrashReport(GetReadableNameForPhase(aPhase)); +} + +void nsTerminator::UpdateHeartbeat(int32_t aStep) { + MOZ_ASSERT(aStep >= mCurrentStep); + + if (aStep > mCurrentStep) { + // Reset the clock, find out how long the current phase has lasted. + uint32_t ticks = gHeartbeat.exchange(0); + if (mCurrentStep >= 0) { + sShutdownSteps[mCurrentStep].mTicks = ticks; + } + sShutdownSteps[aStep].mTicks = 0; + + mCurrentStep = aStep; + } +} + +void nsTerminator::UpdateTelemetry() { + if (!Telemetry::CanRecordExtended() || !gWriteReady) { + return; + } + + // + // We need Telemetry data on the effective duration of each step, + // to be able to tune the time-to-crash of each of both the + // Terminator and AsyncShutdown. However, at this stage, it is too + // late to record such data into Telemetry, so we write it to disk + // and read it upon the next startup. + // + + // Build JSON. + UniquePtr<nsCString> telemetryData(new nsCString()); + telemetryData->AppendLiteral("{"); + size_t fields = 0; + for (auto& shutdownStep : sShutdownSteps) { + if (shutdownStep.mTicks < 0) { + // Ignore this field. + continue; + } + if (fields++ > 0) { + telemetryData->AppendLiteral(", "); + } + telemetryData->AppendLiteral(R"(")"); + telemetryData->Append(GetReadableNameForPhase(shutdownStep.mPhase)); + telemetryData->AppendLiteral(R"(": )"); + telemetryData->AppendInt(shutdownStep.mTicks); + } + telemetryData->AppendLiteral("}"); + + if (fields == 0) { + // Nothing to write + return; + } + + // + // Send data to the worker thread. + // + delete gWriteData.exchange( + telemetryData.release()); // Clear any data that hasn't been written yet + + // In case the worker thread was sleeping, wake it up. + PR_EnterMonitor(gWriteReady); + PR_Notify(gWriteReady); + PR_ExitMonitor(gWriteReady); +} + +void nsTerminator::UpdateCrashReport(const char* aTopic) { + // In case of crash, we wish to know where in shutdown we are + nsAutoCString report(aTopic); + + Unused << CrashReporter::AnnotateCrashReport( + CrashReporter::Annotation::ShutdownProgress, report); +} +} // namespace mozilla diff --git a/toolkit/components/terminator/nsTerminator.h b/toolkit/components/terminator/nsTerminator.h new file mode 100644 index 0000000000..46c3e17b92 --- /dev/null +++ b/toolkit/components/terminator/nsTerminator.h @@ -0,0 +1,49 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * vim: sw=2 ts=2 et lcs=trail\:.,tab\:>~ : + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef nsTerminator_h__ +#define nsTerminator_h__ + +#include "nsISupports.h" +#include "nsIObserver.h" + +namespace mozilla { + +class nsTerminator final : public nsIObserver { + public: + NS_DECL_ISUPPORTS + NS_DECL_NSIOBSERVER + + nsTerminator(); + void AdvancePhase(mozilla::ShutdownPhase aPhase); + + private: + void Start(); + void StartWatchdog(); + void StartWriter(); + + void UpdateHeartbeat(int aStep); + void UpdateTelemetry(); + void UpdateCrashReport(const char* aTopic); + + ~nsTerminator() = default; + + bool mInitialized; + int mCurrentStep; +}; + +} // namespace mozilla + +#define NS_TOOLKIT_TERMINATOR_CID \ + { \ + 0x2e59cc70, 0xf83a, 0x412f, { \ + 0x89, 0xd4, 0x45, 0x38, 0x85, 0x83, 0x72, 0x17 \ + } \ + } +#define NS_TOOLKIT_TERMINATOR_CONTRACTID \ + "@mozilla.org/toolkit/shutdown-terminator;1" + +#endif // nsTerminator_h__ diff --git a/toolkit/components/terminator/terminator.manifest b/toolkit/components/terminator/terminator.manifest new file mode 100644 index 0000000000..33cb143de9 --- /dev/null +++ b/toolkit/components/terminator/terminator.manifest @@ -0,0 +1 @@ +category profile-after-change nsTerminator @mozilla.org/toolkit/shutdown-terminator;1 diff --git a/toolkit/components/terminator/tests/xpcshell/test_terminator_record.js b/toolkit/components/terminator/tests/xpcshell/test_terminator_record.js new file mode 100644 index 0000000000..62cee636f3 --- /dev/null +++ b/toolkit/components/terminator/tests/xpcshell/test_terminator_record.js @@ -0,0 +1,171 @@ +/* Any copyright is dedicated to the Public Domain. + * http://creativecommons.org/publicdomain/zero/1.0/ */ +/* eslint-disable mozilla/no-arbitrary-setTimeout */ + +"use strict"; + +// Test that the Shutdown Terminator records durations correctly + +const { setTimeout } = ChromeUtils.importESModule( + "resource://gre/modules/Timer.sys.mjs" +); + +var PATH; +var PATH_TMP; +var terminator; + +var HEARTBEAT_MS = 100; + +let KEYS = [ + "quit-application", + "profile-change-net-teardown", + "profile-change-teardown", + "profile-before-change", + "profile-before-change-qm", + "xpcom-will-shutdown", + "xpcom-shutdown", + "xpcom-shutdown-threads", + "XPCOMShutdownFinal", + "CCPostLastCycleCollection", +]; + +let DATA = []; +let MeasuredDurations = []; + +add_task(async function init() { + do_get_profile(); + PATH = PathUtils.join(PathUtils.localProfileDir, "ShutdownDuration.json"); + PATH_TMP = PATH + ".tmp"; + + // Initialize the terminator + // (normally, this is done through the manifest file, but xpcshell + // doesn't take them into account). + info("Initializing the Terminator"); + terminator = Cc["@mozilla.org/toolkit/shutdown-terminator;1"].createInstance( + Ci.nsIObserver + ); +}); + +var promiseShutdownDurationData = async function () { + // Wait until PATH exists. + // Timeout if it is never created. + while (true) { + if (await IOUtils.exists(PATH)) { + break; + } + + // Wait just a very short period to not increase measured values. + // Usually the file should appear almost immediately. + await new Promise(resolve => setTimeout(resolve, 50)); + } + + return IOUtils.readJSON(PATH); +}; + +var currentPhase = 0; + +var advancePhase = async function () { + let key = "terminator-test-" + KEYS[currentPhase]; + let msDuration = 200 + HEARTBEAT_MS * currentPhase; + + info("Advancing shutdown phase to " + KEYS[currentPhase]); + terminator.observe(null, key, null); + await new Promise(resolve => setTimeout(resolve, msDuration)); + + let data = await promiseShutdownDurationData(); + + Assert.ok(KEYS[currentPhase] in data, "The file contains the expected key"); + Assert.equal( + Object.keys(data).length, + currentPhase + 1, + "File does not contain more durations than expected" + ); + + DATA[currentPhase] = data; + currentPhase++; + if (currentPhase < KEYS.length) { + return true; + } + return false; +}; + +// This is a timing affected test, as we want to check if the time measurements +// from the terminator are reasonable. Bug 1768795 assumes that they tend to +// be lower than wall-clock, in particular on MacOS, confirmed by the logs on +// intermittent bug 1760094. This is not a big deal for the terminator's +// general functionality (timeouts might just come a little later than +// expected nominally), but it makes testing harder and the transferred +// telemetry data slightly less reliable (shutdowns might appear shorter than +// they really were). So this test is just happy if there is any data that +// is not too long wrt what we expect. If we ever want to fix bug 1768795, +// we can check for a more reasonable lower boundary, too. +add_task(async function test_record() { + info("Collecting duration data for all known phases"); + + let morePhases = true; + while (morePhases) { + let beforeWait = Date.now(); + + morePhases = await advancePhase(); + + await IOUtils.remove(PATH); + await IOUtils.remove(PATH_TMP); + + // We measure the effective time that passed as wall-clock and include all + // file IO overhead as the terminator will do so in its measurement, too. + MeasuredDurations[currentPhase - 1] = Math.floor( + (Date.now() - beforeWait) / HEARTBEAT_MS + ); + } + + Assert.equal(DATA.length, KEYS.length, "We have data for each phase"); + + for (let i = 0; i < KEYS.length; i++) { + let lastDuration = DATA[KEYS.length - 1][KEYS[i]]; + Assert.equal( + typeof lastDuration, + "number", + "Duration of phase " + i + ":" + KEYS[i] + " is a number" + ); + + // The durations are only meaningful after we advanced to the next phase. + if (i < KEYS.length - 1) { + // So we read it from the data written for the following phase. + let ticksDuration = DATA[i + 1][KEYS[i]]; + let measuredDuration = MeasuredDurations[i]; + info( + "measuredDuration:" + measuredDuration + " - " + typeof measuredDuration + ); + Assert.lessOrEqual( + ticksDuration, + measuredDuration + 2, + "Duration of phase " + i + ":" + KEYS[i] + " is not too long" + ); + Assert.greaterOrEqual( + ticksDuration, + 0, // TODO: Raise the lower boundary after bug 1768795. + "Duration of phase " + i + ":" + KEYS[i] + " is not too short" + ); + } + // This check is done only for phases <= xpcom-shutdown-threads + // where we have two data points. + if (i < KEYS.length - 2) { + let ticksDuration = DATA[i + 1][KEYS[i]]; + Assert.equal( + ticksDuration, + DATA[KEYS.length - 1][KEYS[i]], + "Duration of phase " + i + ":" + KEYS[i] + " hasn't changed" + ); + } + } + + // Note that after this check the KEYS array remains sorted, so this + // must be the last check to not get confused. + Assert.equal( + Object.keys(DATA[KEYS.length - 1]) + .sort() + .join(", "), + KEYS.sort().join(", "), + "The last file contains all expected keys" + ); +}); diff --git a/toolkit/components/terminator/tests/xpcshell/test_terminator_reload.js b/toolkit/components/terminator/tests/xpcshell/test_terminator_reload.js new file mode 100644 index 0000000000..79aecd818b --- /dev/null +++ b/toolkit/components/terminator/tests/xpcshell/test_terminator_reload.js @@ -0,0 +1,88 @@ +/* Any copyright is dedicated to the Public Domain. + * http://creativecommons.org/publicdomain/zero/1.0/ */ + +"use strict"; + +// Test that the Shutdown Terminator reloads durations correctly + +const HISTOGRAMS = { + "quit-application": "SHUTDOWN_PHASE_DURATION_TICKS_QUIT_APPLICATION", + "profile-change-net-teardown": + "SHUTDOWN_PHASE_DURATION_TICKS_PROFILE_CHANGE_NET_TEARDOWN", + "profile-change-teardown": + "SHUTDOWN_PHASE_DURATION_TICKS_PROFILE_CHANGE_TEARDOWN", + "profile-before-change": + "SHUTDOWN_PHASE_DURATION_TICKS_PROFILE_BEFORE_CHANGE", + "profile-before-change-qm": + "SHUTDOWN_PHASE_DURATION_TICKS_PROFILE_BEFORE_CHANGE_QM", + "xpcom-will-shutdown": "SHUTDOWN_PHASE_DURATION_TICKS_XPCOM_WILL_SHUTDOWN", + "xpcom-shutdown": "SHUTDOWN_PHASE_DURATION_TICKS_XPCOM_SHUTDOWN", +}; + +let PATH; + +add_setup(async function init() { + do_get_profile(); + PATH = PathUtils.join(PathUtils.localProfileDir, "ShutdownDuration.json"); +}); + +add_task(async function test_reload() { + info("Forging data"); + let data = {}; + let telemetrySnapshots = Services.telemetry.getSnapshotForHistograms( + "main", + false /* clear */ + ).parent; + let i = 0; + for (let k of Object.keys(HISTOGRAMS)) { + let id = HISTOGRAMS[k]; + data[k] = i++; + Assert.equal( + telemetrySnapshots[id] || undefined, + undefined, + "Histogram " + id + " is empty" + ); + } + + // Extra fields that nsTerminator reports that we do not have histograms for. + data["xpcom-shutdown-threads"] = 123; + data.XPCOMShutdownFinal = 456; + data.CCPostLastCycleCollection = 789; + + await IOUtils.writeJSON(PATH, data); + + const TOPIC = "shutdown-terminator-telemetry-updated"; + + let wait = new Promise(resolve => + Services.obs.addObserver(function observer() { + info("Telemetry has been updated"); + Services.obs.removeObserver(observer, TOPIC); + resolve(); + }, TOPIC) + ); + + info("Starting nsTerminatorTelemetry"); + let tt = Cc[ + "@mozilla.org/toolkit/shutdown-terminator-telemetry;1" + ].createInstance(Ci.nsIObserver); + tt.observe(null, "profile-after-change", ""); + + info("Waiting until telemetry is updated"); + // Now wait until Telemetry is updated + await wait; + + telemetrySnapshots = Services.telemetry.getSnapshotForHistograms( + "main", + false /* clear */ + ).parent; + for (let k of Object.keys(HISTOGRAMS)) { + let id = HISTOGRAMS[k]; + info("Testing histogram " + id); + let snapshot = telemetrySnapshots[id]; + let count = 0; + for (let x of Object.values(snapshot.values)) { + count += x; + } + Assert.equal(count, 1, "We have added one item"); + } +}); diff --git a/toolkit/components/terminator/tests/xpcshell/xpcshell.toml b/toolkit/components/terminator/tests/xpcshell/xpcshell.toml new file mode 100644 index 0000000000..cb328ca2c8 --- /dev/null +++ b/toolkit/components/terminator/tests/xpcshell/xpcshell.toml @@ -0,0 +1,15 @@ +[DEFAULT] +head = "" + +["test_terminator_record.js"] +skip-if = [ + "debug", + "asan", # Disabled by bug 1242084, bug 1255484 will enable it again + "ccov", # Bug 1607583 tracks the ccov failure + "tsan", # Bug 1683730 made this timeout for tsan. +] +run-sequentially = "very high failure rate in parallel" + +["test_terminator_reload.js"] +skip-if = ["os == 'android'"] +run-sequentially = "very high failure rate in parallel" |