diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-07-24 09:54:23 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-07-24 09:54:44 +0000 |
commit | 836b47cb7e99a977c5a23b059ca1d0b5065d310e (patch) | |
tree | 1604da8f482d02effa033c94a84be42bc0c848c3 /src/daemon/watcher.c | |
parent | Releasing debian version 1.44.3-2. (diff) | |
download | netdata-836b47cb7e99a977c5a23b059ca1d0b5065d310e.tar.xz netdata-836b47cb7e99a977c5a23b059ca1d0b5065d310e.zip |
Merging upstream version 1.46.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/daemon/watcher.c')
-rw-r--r-- | src/daemon/watcher.c | 178 |
1 files changed, 178 insertions, 0 deletions
diff --git a/src/daemon/watcher.c b/src/daemon/watcher.c new file mode 100644 index 000000000..1e0090e24 --- /dev/null +++ b/src/daemon/watcher.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "watcher.h" + +watcher_step_t *watcher_steps; + +static struct completion shutdown_begin_completion; +static struct completion shutdown_end_completion; +static ND_THREAD *watcher_thread; + +void watcher_shutdown_begin(void) { + completion_mark_complete(&shutdown_begin_completion); +} + +void watcher_shutdown_end(void) { + completion_mark_complete(&shutdown_end_completion); +} + +void watcher_step_complete(watcher_step_id_t step_id) { + completion_mark_complete(&watcher_steps[step_id].p); +} + +static void watcher_wait_for_step(const watcher_step_id_t step_id) +{ + unsigned timeout = 90; + + usec_t step_start_time = now_monotonic_usec(); + +#ifdef ENABLE_SENTRY + // Wait with a timeout + bool ok = completion_timedwait_for(&watcher_steps[step_id].p, timeout); +#else + // Wait indefinitely + bool ok = true; + completion_wait_for(&watcher_steps[step_id].p); +#endif + + usec_t step_duration = now_monotonic_usec() - step_start_time; + + if (ok) { + netdata_log_info("shutdown step: [%d/%d] - '%s' finished in %llu milliseconds", + (int)step_id + 1, (int)WATCHER_STEP_ID_MAX, + watcher_steps[step_id].msg, step_duration / USEC_PER_MS); + } else { + // Do not call fatal() because it will try to execute the exit + // sequence twice. + netdata_log_error("shutdown step: [%d/%d] - '%s' took more than %u seconds (ie. %llu milliseconds)", + (int)step_id + 1, (int)WATCHER_STEP_ID_MAX, watcher_steps[step_id].msg, + timeout, step_duration / USEC_PER_MS); + + abort(); + } +} + +void *watcher_main(void *arg) +{ + UNUSED(arg); + + netdata_log_debug(D_SYSTEM, "Watcher thread started"); + + // wait until the agent starts the shutdown process + completion_wait_for(&shutdown_begin_completion); + netdata_log_error("Shutdown process started"); + + usec_t shutdown_start_time = now_monotonic_usec(); + + watcher_wait_for_step(WATCHER_STEP_ID_CREATE_SHUTDOWN_FILE); + watcher_wait_for_step(WATCHER_STEP_ID_DBENGINE_EXIT_MODE); + watcher_wait_for_step(WATCHER_STEP_ID_CLOSE_WEBRTC_CONNECTIONS); + watcher_wait_for_step(WATCHER_STEP_ID_DISABLE_MAINTENANCE_NEW_QUERIES_NEW_WEB_REQUESTS_NEW_STREAMING_CONNECTIONS_AND_ACLK); + watcher_wait_for_step(WATCHER_STEP_ID_STOP_MAINTENANCE_THREAD); + watcher_wait_for_step(WATCHER_STEP_ID_STOP_EXPORTERS_HEALTH_AND_WEB_SERVERS_THREADS); + watcher_wait_for_step(WATCHER_STEP_ID_STOP_COLLECTORS_AND_STREAMING_THREADS); + watcher_wait_for_step(WATCHER_STEP_ID_STOP_REPLICATION_THREADS); + watcher_wait_for_step(WATCHER_STEP_ID_PREPARE_METASYNC_SHUTDOWN); + watcher_wait_for_step(WATCHER_STEP_ID_DISABLE_ML_DETECTION_AND_TRAINING_THREADS); + watcher_wait_for_step(WATCHER_STEP_ID_STOP_CONTEXT_THREAD); + watcher_wait_for_step(WATCHER_STEP_ID_CLEAR_WEB_CLIENT_CACHE); + watcher_wait_for_step(WATCHER_STEP_ID_STOP_ACLK_THREADS); + watcher_wait_for_step(WATCHER_STEP_ID_STOP_ALL_REMAINING_WORKER_THREADS); + watcher_wait_for_step(WATCHER_STEP_ID_CANCEL_MAIN_THREADS); + watcher_wait_for_step(WATCHER_STEP_ID_FLUSH_DBENGINE_TIERS); + watcher_wait_for_step(WATCHER_STEP_ID_STOP_COLLECTION_FOR_ALL_HOSTS); + watcher_wait_for_step(WATCHER_STEP_ID_STOP_METASYNC_THREADS); + watcher_wait_for_step(WATCHER_STEP_ID_WAIT_FOR_DBENGINE_COLLECTORS_TO_FINISH); + watcher_wait_for_step(WATCHER_STEP_ID_WAIT_FOR_DBENGINE_MAIN_CACHE_TO_FINISH_FLUSHING); + watcher_wait_for_step(WATCHER_STEP_ID_STOP_DBENGINE_TIERS); + watcher_wait_for_step(WATCHER_STEP_ID_CLOSE_SQL_DATABASES); + watcher_wait_for_step(WATCHER_STEP_ID_REMOVE_PID_FILE); + watcher_wait_for_step(WATCHER_STEP_ID_FREE_OPENSSL_STRUCTURES); + watcher_wait_for_step(WATCHER_STEP_ID_REMOVE_INCOMPLETE_SHUTDOWN_FILE); + + completion_wait_for(&shutdown_end_completion); + usec_t shutdown_end_time = now_monotonic_usec(); + + usec_t shutdown_duration = shutdown_end_time - shutdown_start_time; + netdata_log_error("Shutdown process ended in %llu milliseconds", + shutdown_duration / USEC_PER_MS); + + return NULL; +} + +void watcher_thread_start() { + watcher_steps = callocz(WATCHER_STEP_ID_MAX, sizeof(watcher_step_t)); + + watcher_steps[WATCHER_STEP_ID_CREATE_SHUTDOWN_FILE].msg = + "create shutdown file"; + watcher_steps[WATCHER_STEP_ID_DBENGINE_EXIT_MODE].msg = + "dbengine exit mode"; + watcher_steps[WATCHER_STEP_ID_CLOSE_WEBRTC_CONNECTIONS].msg = + "close webrtc connections"; + watcher_steps[WATCHER_STEP_ID_DISABLE_MAINTENANCE_NEW_QUERIES_NEW_WEB_REQUESTS_NEW_STREAMING_CONNECTIONS_AND_ACLK].msg = + "disable maintenance, new queries, new web requests, new streaming connections and aclk"; + watcher_steps[WATCHER_STEP_ID_STOP_MAINTENANCE_THREAD].msg = + "stop maintenance thread"; + watcher_steps[WATCHER_STEP_ID_STOP_EXPORTERS_HEALTH_AND_WEB_SERVERS_THREADS].msg = + "stop exporters, health and web servers threads"; + watcher_steps[WATCHER_STEP_ID_STOP_COLLECTORS_AND_STREAMING_THREADS].msg = + "stop collectors and streaming threads"; + watcher_steps[WATCHER_STEP_ID_STOP_REPLICATION_THREADS].msg = + "stop replication threads"; + watcher_steps[WATCHER_STEP_ID_PREPARE_METASYNC_SHUTDOWN].msg = + "prepare metasync shutdown"; + watcher_steps[WATCHER_STEP_ID_DISABLE_ML_DETECTION_AND_TRAINING_THREADS].msg = + "disable ML detection and training threads"; + watcher_steps[WATCHER_STEP_ID_STOP_CONTEXT_THREAD].msg = + "stop context thread"; + watcher_steps[WATCHER_STEP_ID_CLEAR_WEB_CLIENT_CACHE].msg = + "clear web client cache"; + watcher_steps[WATCHER_STEP_ID_STOP_ACLK_THREADS].msg = + "stop aclk threads"; + watcher_steps[WATCHER_STEP_ID_STOP_ALL_REMAINING_WORKER_THREADS].msg = + "stop all remaining worker threads"; + watcher_steps[WATCHER_STEP_ID_CANCEL_MAIN_THREADS].msg = + "cancel main threads"; + watcher_steps[WATCHER_STEP_ID_FLUSH_DBENGINE_TIERS].msg = + "flush dbengine tiers"; + watcher_steps[WATCHER_STEP_ID_STOP_COLLECTION_FOR_ALL_HOSTS].msg = + "stop collection for all hosts"; + watcher_steps[WATCHER_STEP_ID_STOP_METASYNC_THREADS].msg = + "stop metasync threads"; + watcher_steps[WATCHER_STEP_ID_WAIT_FOR_DBENGINE_COLLECTORS_TO_FINISH].msg = + "wait for dbengine collectors to finish"; + watcher_steps[WATCHER_STEP_ID_WAIT_FOR_DBENGINE_MAIN_CACHE_TO_FINISH_FLUSHING].msg = + "wait for dbengine main cache to finish flushing"; + watcher_steps[WATCHER_STEP_ID_STOP_DBENGINE_TIERS].msg = + "stop dbengine tiers"; + watcher_steps[WATCHER_STEP_ID_CLOSE_SQL_DATABASES].msg = + "close SQL databases"; + watcher_steps[WATCHER_STEP_ID_REMOVE_PID_FILE].msg = + "remove pid file"; + watcher_steps[WATCHER_STEP_ID_FREE_OPENSSL_STRUCTURES].msg = + "free openssl structures"; + watcher_steps[WATCHER_STEP_ID_REMOVE_INCOMPLETE_SHUTDOWN_FILE].msg = + "remove incomplete shutdown file"; + + for (size_t i = 0; i != WATCHER_STEP_ID_MAX; i++) { + completion_init(&watcher_steps[i].p); + } + + completion_init(&shutdown_begin_completion); + completion_init(&shutdown_end_completion); + + watcher_thread = nd_thread_create("P[WATCHER]", NETDATA_THREAD_OPTION_JOINABLE, watcher_main, NULL); +} + +void watcher_thread_stop() { + nd_thread_join(watcher_thread); + + for (size_t i = 0; i != WATCHER_STEP_ID_MAX; i++) { + completion_destroy(&watcher_steps[i].p); + } + + completion_destroy(&shutdown_begin_completion); + completion_destroy(&shutdown_end_completion); + + freez(watcher_steps); +} |