summaryrefslogtreecommitdiffstats
path: root/src/daemon/watcher.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/daemon/watcher.c')
-rw-r--r--src/daemon/watcher.c178
1 files changed, 178 insertions, 0 deletions
diff --git a/src/daemon/watcher.c b/src/daemon/watcher.c
new file mode 100644
index 000000000..3eea22019
--- /dev/null
+++ b/src/daemon/watcher.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "watcher.h"
+
+watcher_step_t *watcher_steps; // per-step state indexed by watcher_step_id_t; allocated in watcher_thread_start(), freed in watcher_thread_stop()
+
+static struct completion shutdown_begin_completion; // completed by watcher_shutdown_begin(); watcher_main() waits on it first
+static struct completion shutdown_end_completion; // completed by watcher_shutdown_end(); watcher_main() waits on it last
+static netdata_thread_t watcher_thread; // thread created in watcher_thread_start() and joined in watcher_thread_stop()
+
+// Tell the watcher thread that the shutdown sequence has started.
+// Completes shutdown_begin_completion, the object watcher_main() blocks on
+// before it begins waiting for the individual shutdown steps.
+void watcher_shutdown_begin(void)
+{
+    completion_mark_complete(&shutdown_begin_completion);
+}
+
+// Tell the watcher thread that the shutdown sequence has finished.
+// Completes shutdown_end_completion, the last object watcher_main() waits
+// on before it logs the total shutdown duration and returns.
+void watcher_shutdown_end(void)
+{
+    completion_mark_complete(&shutdown_end_completion);
+}
+
+// Report that the shutdown step identified by step_id is done, by
+// completing the completion object stored in its watcher_steps[] slot.
+// step_id is used directly as an index; no range check is performed.
+void watcher_step_complete(watcher_step_id_t step_id)
+{
+    watcher_step_t *step = &watcher_steps[step_id];
+
+    completion_mark_complete(&step->p);
+}
+
+// Wait for one shutdown step to be marked complete and log how long the
+// wait lasted.
+//
+// When ENABLE_SENTRY is defined the wait is bounded by `timeout`; if the
+// step is not reported complete, an error is logged and the process is
+// aborted. Otherwise the wait is unbounded and the step is always logged
+// as finished.
+static void watcher_wait_for_step(const watcher_step_id_t step_id)
+{
+    unsigned timeout = 90; // reported as seconds in the error message below
+    watcher_step_t *step = &watcher_steps[step_id];
+    bool completed;
+
+    usec_t wait_started_ut = now_monotonic_usec();
+
+#ifdef ENABLE_SENTRY
+    // Bounded wait
+    completed = completion_timedwait_for(&step->p, timeout);
+#else
+    // Unbounded wait: this path can never report a timeout
+    completion_wait_for(&step->p);
+    completed = true;
+#endif
+
+    usec_t waited_ut = now_monotonic_usec() - wait_started_ut;
+
+    if (!completed) {
+        // Do not call fatal() because it will try to execute the exit
+        // sequence twice.
+        netdata_log_error("shutdown step: [%d/%d] - '%s' took more than %u seconds (ie. %llu milliseconds)",
+                          step_id + 1, WATCHER_STEP_ID_MAX, step->msg,
+                          timeout, waited_ut / USEC_PER_MS);
+
+        abort();
+    }
+
+    netdata_log_info("shutdown step: [%d/%d] - '%s' finished in %llu milliseconds",
+                     step_id + 1, WATCHER_STEP_ID_MAX,
+                     step->msg, waited_ut / USEC_PER_MS);
+}
+
+// Body of the watcher thread.
+//
+// Blocks until watcher_shutdown_begin() has been called, then waits for
+// each shutdown step in the fixed order listed in step_order[], and
+// finally waits for watcher_shutdown_end() before logging how long the
+// whole shutdown took. `arg` is ignored; the return value is always NULL.
+void *watcher_main(void *arg)
+{
+    // Order in which the shutdown steps are awaited.
+    static const watcher_step_id_t step_order[] = {
+        WATCHER_STEP_ID_CREATE_SHUTDOWN_FILE,
+        WATCHER_STEP_ID_DBENGINE_EXIT_MODE,
+        WATCHER_STEP_ID_CLOSE_WEBRTC_CONNECTIONS,
+        WATCHER_STEP_ID_DISABLE_MAINTENANCE_NEW_QUERIES_NEW_WEB_REQUESTS_NEW_STREAMING_CONNECTIONS_AND_ACLK,
+        WATCHER_STEP_ID_STOP_MAINTENANCE_THREAD,
+        WATCHER_STEP_ID_STOP_EXPORTERS_HEALTH_AND_WEB_SERVERS_THREADS,
+        WATCHER_STEP_ID_STOP_COLLECTORS_AND_STREAMING_THREADS,
+        WATCHER_STEP_ID_STOP_REPLICATION_THREADS,
+        WATCHER_STEP_ID_PREPARE_METASYNC_SHUTDOWN,
+        WATCHER_STEP_ID_DISABLE_ML_DETECTION_AND_TRAINING_THREADS,
+        WATCHER_STEP_ID_STOP_CONTEXT_THREAD,
+        WATCHER_STEP_ID_CLEAR_WEB_CLIENT_CACHE,
+        WATCHER_STEP_ID_STOP_ACLK_THREADS,
+        WATCHER_STEP_ID_STOP_ALL_REMAINING_WORKER_THREADS,
+        WATCHER_STEP_ID_CANCEL_MAIN_THREADS,
+        WATCHER_STEP_ID_FLUSH_DBENGINE_TIERS,
+        WATCHER_STEP_ID_STOP_COLLECTION_FOR_ALL_HOSTS,
+        WATCHER_STEP_ID_STOP_METASYNC_THREADS,
+        WATCHER_STEP_ID_WAIT_FOR_DBENGINE_COLLECTORS_TO_FINISH,
+        WATCHER_STEP_ID_WAIT_FOR_DBENGINE_MAIN_CACHE_TO_FINISH_FLUSHING,
+        WATCHER_STEP_ID_STOP_DBENGINE_TIERS,
+        WATCHER_STEP_ID_CLOSE_SQL_DATABASES,
+        WATCHER_STEP_ID_REMOVE_PID_FILE,
+        WATCHER_STEP_ID_FREE_OPENSSL_STRUCTURES,
+        WATCHER_STEP_ID_REMOVE_INCOMPLETE_SHUTDOWN_FILE,
+    };
+    const size_t step_count = sizeof(step_order) / sizeof(step_order[0]);
+
+    UNUSED(arg);
+
+    netdata_log_debug(D_SYSTEM, "Watcher thread started");
+
+    // wait until the agent starts the shutdown process
+    completion_wait_for(&shutdown_begin_completion);
+    netdata_log_error("Shutdown process started");
+
+    usec_t begin_ut = now_monotonic_usec();
+
+    for (size_t idx = 0; idx < step_count; idx++) {
+        watcher_wait_for_step(step_order[idx]);
+    }
+
+    // every step is done; wait for the agent to declare the shutdown over
+    completion_wait_for(&shutdown_end_completion);
+
+    usec_t elapsed_ut = now_monotonic_usec() - begin_ut;
+    netdata_log_error("Shutdown process ended in %llu milliseconds",
+                      elapsed_ut / USEC_PER_MS);
+
+    return NULL;
+}
+
+// Allocate and initialise the watcher state, then start the watcher thread.
+//
+// Allocates one zeroed watcher_step_t per shutdown step, assigns each step
+// the human-readable message used in the shutdown log lines, initialises
+// every per-step completion plus the begin/end completions, and finally
+// creates the joinable "P[WATCHER]" thread running watcher_main().
+//
+// Declared with (void) so the definition is a real prototype: before C23
+// an empty parameter list leaves the arguments unspecified and unchecked.
+void watcher_thread_start(void) {
+    watcher_steps = callocz(WATCHER_STEP_ID_MAX, sizeof(watcher_step_t));
+
+    watcher_steps[WATCHER_STEP_ID_CREATE_SHUTDOWN_FILE].msg =
+        "create shutdown file";
+    watcher_steps[WATCHER_STEP_ID_DBENGINE_EXIT_MODE].msg =
+        "dbengine exit mode";
+    watcher_steps[WATCHER_STEP_ID_CLOSE_WEBRTC_CONNECTIONS].msg =
+        "close webrtc connections";
+    watcher_steps[WATCHER_STEP_ID_DISABLE_MAINTENANCE_NEW_QUERIES_NEW_WEB_REQUESTS_NEW_STREAMING_CONNECTIONS_AND_ACLK].msg =
+        "disable maintenance, new queries, new web requests, new streaming connections and aclk";
+    watcher_steps[WATCHER_STEP_ID_STOP_MAINTENANCE_THREAD].msg =
+        "stop maintenance thread";
+    watcher_steps[WATCHER_STEP_ID_STOP_EXPORTERS_HEALTH_AND_WEB_SERVERS_THREADS].msg =
+        "stop exporters, health and web servers threads";
+    watcher_steps[WATCHER_STEP_ID_STOP_COLLECTORS_AND_STREAMING_THREADS].msg =
+        "stop collectors and streaming threads";
+    watcher_steps[WATCHER_STEP_ID_STOP_REPLICATION_THREADS].msg =
+        "stop replication threads";
+    watcher_steps[WATCHER_STEP_ID_PREPARE_METASYNC_SHUTDOWN].msg =
+        "prepare metasync shutdown";
+    watcher_steps[WATCHER_STEP_ID_DISABLE_ML_DETECTION_AND_TRAINING_THREADS].msg =
+        "disable ML detection and training threads";
+    watcher_steps[WATCHER_STEP_ID_STOP_CONTEXT_THREAD].msg =
+        "stop context thread";
+    watcher_steps[WATCHER_STEP_ID_CLEAR_WEB_CLIENT_CACHE].msg =
+        "clear web client cache";
+    watcher_steps[WATCHER_STEP_ID_STOP_ACLK_THREADS].msg =
+        "stop aclk threads";
+    watcher_steps[WATCHER_STEP_ID_STOP_ALL_REMAINING_WORKER_THREADS].msg =
+        "stop all remaining worker threads";
+    watcher_steps[WATCHER_STEP_ID_CANCEL_MAIN_THREADS].msg =
+        "cancel main threads";
+    watcher_steps[WATCHER_STEP_ID_FLUSH_DBENGINE_TIERS].msg =
+        "flush dbengine tiers";
+    watcher_steps[WATCHER_STEP_ID_STOP_COLLECTION_FOR_ALL_HOSTS].msg =
+        "stop collection for all hosts";
+    watcher_steps[WATCHER_STEP_ID_STOP_METASYNC_THREADS].msg =
+        "stop metasync threads";
+    watcher_steps[WATCHER_STEP_ID_WAIT_FOR_DBENGINE_COLLECTORS_TO_FINISH].msg =
+        "wait for dbengine collectors to finish";
+    watcher_steps[WATCHER_STEP_ID_WAIT_FOR_DBENGINE_MAIN_CACHE_TO_FINISH_FLUSHING].msg =
+        "wait for dbengine main cache to finish flushing";
+    watcher_steps[WATCHER_STEP_ID_STOP_DBENGINE_TIERS].msg =
+        "stop dbengine tiers";
+    watcher_steps[WATCHER_STEP_ID_CLOSE_SQL_DATABASES].msg =
+        "close SQL databases";
+    watcher_steps[WATCHER_STEP_ID_REMOVE_PID_FILE].msg =
+        "remove pid file";
+    watcher_steps[WATCHER_STEP_ID_FREE_OPENSSL_STRUCTURES].msg =
+        "free openssl structures";
+    watcher_steps[WATCHER_STEP_ID_REMOVE_INCOMPLETE_SHUTDOWN_FILE].msg =
+        "remove incomplete shutdown file";
+
+    // Every completion must be initialised before the thread is created,
+    // because watcher_main() starts waiting on them immediately.
+    for (size_t i = 0; i < WATCHER_STEP_ID_MAX; i++) {
+        completion_init(&watcher_steps[i].p);
+    }
+
+    completion_init(&shutdown_begin_completion);
+    completion_init(&shutdown_end_completion);
+
+    // Joinable so that watcher_thread_stop() can wait for it to finish.
+    netdata_thread_create(&watcher_thread, "P[WATCHER]", NETDATA_THREAD_OPTION_JOINABLE, watcher_main, NULL);
+}
+
+// Wait for the watcher thread to finish and release everything that
+// watcher_thread_start() set up.
+//
+// Joins the watcher thread first, so none of the completions can still be
+// in use by watcher_main() when they are destroyed. The join only returns
+// once watcher_main() has returned, i.e. after the begin completion, every
+// step completion and the end completion have all been signalled.
+//
+// Declared with (void) so the definition is a real prototype: before C23
+// an empty parameter list leaves the arguments unspecified and unchecked.
+void watcher_thread_stop(void) {
+    netdata_thread_join(watcher_thread, NULL);
+
+    for (size_t i = 0; i < WATCHER_STEP_ID_MAX; i++) {
+        completion_destroy(&watcher_steps[i].p);
+    }
+
+    completion_destroy(&shutdown_begin_completion);
+    completion_destroy(&shutdown_end_completion);
+
+    freez(watcher_steps);
+    // Do not leave the global pointing at freed memory: a late
+    // watcher_step_complete() then faults on NULL instead of silently
+    // writing into a released allocation.
+    watcher_steps = NULL;
+}