diff options
Diffstat (limited to 'daemon')
-rw-r--r-- | daemon/Makefile.am | 2 | ||||
-rw-r--r-- | daemon/global_statistics.c | 90 | ||||
-rw-r--r-- | daemon/main.c | 36 | ||||
-rw-r--r-- | daemon/main.h | 2 | ||||
-rw-r--r-- | daemon/signals.c | 103 |
5 files changed, 194 insertions, 39 deletions
diff --git a/daemon/Makefile.am b/daemon/Makefile.am index e020e517..ee1b53d0 100644 --- a/daemon/Makefile.am +++ b/daemon/Makefile.am @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-3.0-or-later AUTOMAKE_OPTIONS = subdir-objects -MAINTAINERCLEANFILES= $(srcdir)/Makefile.in +MAINTAINERCLEANFILES = $(srcdir)/Makefile.in CLEANFILES = \ anonymous-statistics.sh \ $(NULL) diff --git a/daemon/global_statistics.c b/daemon/global_statistics.c index 53b7546f..2bcc5c9f 100644 --- a/daemon/global_statistics.c +++ b/daemon/global_statistics.c @@ -534,11 +534,30 @@ void global_statistics_charts(void) { // ---------------------------------------------------------------- #ifdef ENABLE_DBENGINE - if (localhost->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { - unsigned long long stats_array[RRDENG_NR_STATS]; + RRDHOST *host; + unsigned long long stats_array[RRDENG_NR_STATS] = {0}; + unsigned long long local_stats_array[RRDENG_NR_STATS]; + unsigned hosts_with_dbengine = 0, i; + + rrd_rdlock(); + rrdhost_foreach_read(host) { + if (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) { + ++hosts_with_dbengine; + /* get localhost's DB engine's statistics */ + rrdeng_get_33_statistics(host->rrdeng_ctx, local_stats_array); + for (i = 0 ; i < RRDENG_NR_STATS ; ++i) { + /* aggregate statistics across hosts */ + stats_array[i] += local_stats_array[i]; + } + } + } + rrd_unlock(); - /* get localhost's DB engine's statistics */ - rrdeng_get_33_statistics(localhost->rrdeng_ctx, stats_array); + if (hosts_with_dbengine) { + /* deduplicate global statistics by getting the ones from the last host */ + stats_array[30] = local_stats_array[30]; + stats_array[31] = local_stats_array[31]; + stats_array[32] = local_stats_array[32]; // ---------------------------------------------------------------- @@ -639,7 +658,7 @@ void global_statistics_charts(void) { static RRDSET *st_pg_cache_pages = NULL; static RRDDIM *rd_descriptors = NULL; static RRDDIM *rd_populated = NULL; - static RRDDIM *rd_commited = NULL; + static RRDDIM *rd_committed = NULL; static RRDDIM *rd_insertions = NULL; static RRDDIM *rd_deletions = NULL; static RRDDIM *rd_backfills = NULL; @@ -663,7 +682,7 @@ void global_statistics_charts(void) { rd_descriptors = rrddim_add(st_pg_cache_pages, "descriptors", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); rd_populated = rrddim_add(st_pg_cache_pages, "populated", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); - rd_commited = rrddim_add(st_pg_cache_pages, "commited", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); + rd_committed = rrddim_add(st_pg_cache_pages, "committed", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE); rd_insertions = rrddim_add(st_pg_cache_pages, "insertions", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); rd_deletions = rrddim_add(st_pg_cache_pages, "deletions", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL); rd_backfills = rrddim_add(st_pg_cache_pages, "backfills", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL); @@ -674,7 +693,7 @@ void global_statistics_charts(void) { rrddim_set_by_pointer(st_pg_cache_pages, rd_descriptors, (collected_number)stats_array[27]); rrddim_set_by_pointer(st_pg_cache_pages, rd_populated, (collected_number)stats_array[3]); - rrddim_set_by_pointer(st_pg_cache_pages, rd_commited, (collected_number)stats_array[4]); + rrddim_set_by_pointer(st_pg_cache_pages, rd_committed, (collected_number)stats_array[4]); rrddim_set_by_pointer(st_pg_cache_pages, rd_insertions, (collected_number)stats_array[5]); rrddim_set_by_pointer(st_pg_cache_pages, rd_deletions, (collected_number)stats_array[6]); rrddim_set_by_pointer(st_pg_cache_pages, rd_backfills, (collected_number)stats_array[9]); @@ -818,6 +837,63 @@ void global_statistics_charts(void) { rrddim_set_by_pointer(st_fd, rd_fd_max, (collected_number)rlimit_nofile.rlim_cur / 4); rrdset_done(st_fd); } + + // ---------------------------------------------------------------- + + { + static RRDSET *st_ram_usage = NULL; + static RRDDIM *rd_cached = NULL; + static RRDDIM *rd_pinned = NULL; + static RRDDIM *rd_metadata = NULL; + + collected_number cached_pages, pinned_pages, API_producers, populated_pages, metadata, pages_on_disk, + page_cache_descriptors; + + if (unlikely(!st_ram_usage)) { + st_ram_usage = rrdset_create_localhost( + "netdata" + , "dbengine_ram" + , NULL + , "dbengine" + , NULL + , "NetData DB engine RAM usage" + , "MiB" + , "netdata" + , "stats" + , 130509 + , localhost->rrd_update_every + , RRDSET_TYPE_STACKED + ); + + rd_cached = rrddim_add(st_ram_usage, "cache", NULL, 1, 256, RRD_ALGORITHM_ABSOLUTE); + rd_pinned = rrddim_add(st_ram_usage, "collectors", NULL, 1, 256, RRD_ALGORITHM_ABSOLUTE); + rd_metadata = rrddim_add(st_ram_usage, "metadata", NULL, 1, 1048576, RRD_ALGORITHM_ABSOLUTE); + } + else + rrdset_next(st_ram_usage); + + API_producers = (collected_number)stats_array[0]; + pages_on_disk = (collected_number)stats_array[2]; + populated_pages = (collected_number)stats_array[3]; + page_cache_descriptors = (collected_number)stats_array[27]; + + if (API_producers * 2 > populated_pages) { + pinned_pages = API_producers; + } else{ + pinned_pages = API_producers * 2; + } + cached_pages = populated_pages - pinned_pages; + + metadata = page_cache_descriptors * sizeof(struct page_cache_descr); + metadata += pages_on_disk * sizeof(struct rrdeng_page_descr); + /* This is an empirical estimation for Judy array indexing and extent structures */ + metadata += pages_on_disk * 58; + + rrddim_set_by_pointer(st_ram_usage, rd_cached, cached_pages); + rrddim_set_by_pointer(st_ram_usage, rd_pinned, pinned_pages); + rrddim_set_by_pointer(st_ram_usage, rd_metadata, metadata); + rrdset_done(st_ram_usage); + } } #endif diff --git a/daemon/main.c b/daemon/main.c index 4189ac7b..0e56654d 100644 --- a/daemon/main.c +++ b/daemon/main.c @@ -146,46 +146,28 @@ void web_server_config_options(void) { } -int killpid(pid_t pid, int signal) -{ - int ret = -1; +// killpid kills pid with SIGTERM. +int killpid(pid_t pid) { + int ret; debug(D_EXIT, "Request to kill pid %d", pid); errno = 0; - if(kill(pid, 0) == -1) { + ret = kill(pid, SIGTERM); + if (ret == -1) { switch(errno) { case ESRCH: - error("Request to kill pid %d, but it is not running.", pid); - break; + // We wanted the process to exit so just let the caller handle. + return ret; case EPERM: - error("Request to kill pid %d, but I do not have enough permissions.", pid); + error("Cannot kill pid %d, but I do not have enough permissions.", pid); break; default: - error("Request to kill pid %d, but I received an error.", pid); + error("Cannot kill pid %d, but I received an error.", pid); break; } } - else { - errno = 0; - ret = kill(pid, signal); - if(ret == -1) { - switch(errno) { - case ESRCH: - error("Cannot kill pid %d, but it is not running.", pid); - break; - - case EPERM: - error("Cannot kill pid %d, but I do not have enough permissions.", pid); - break; - - default: - error("Cannot kill pid %d, but I received an error.", pid); - break; - } - } - } return ret; } diff --git a/daemon/main.h b/daemon/main.h index 68715598..9d9f4ef0 100644 --- a/daemon/main.h +++ b/daemon/main.h @@ -41,7 +41,7 @@ struct netdata_static_thread { }; extern void cancel_main_threads(void); -extern int killpid(pid_t pid, int signal); +extern int killpid(pid_t pid); extern void netdata_cleanup_and_exit(int ret) NORETURN; extern void send_statistics(const char *action, const char *action_result, const char *action_data); diff --git a/daemon/signals.c b/daemon/signals.c index 71f27188..5378b04e 100644 --- a/daemon/signals.c +++ b/daemon/signals.c @@ -2,6 +2,8 @@ #include "common.h" +static int reaper_enabled = 0; + typedef enum signal_action { NETDATA_SIGNAL_END_OF_LIST, NETDATA_SIGNAL_IGNORE, @@ -10,6 +12,7 @@ typedef enum signal_action { NETDATA_SIGNAL_LOG_ROTATE, NETDATA_SIGNAL_RELOAD_HEALTH, NETDATA_SIGNAL_FATAL, + NETDATA_SIGNAL_CHILD, } SIGNAL_ACTION; static struct { @@ -26,6 +29,7 @@ static struct { { SIGUSR1, "SIGUSR1", 0, NETDATA_SIGNAL_SAVE_DATABASE }, { SIGUSR2, "SIGUSR2", 0, NETDATA_SIGNAL_RELOAD_HEALTH }, { SIGBUS, "SIGBUS", 0, NETDATA_SIGNAL_FATAL }, + { SIGCHLD, "SIGCHLD", 0, NETDATA_SIGNAL_CHILD }, // terminator { 0, "NONE", 0, NETDATA_SIGNAL_END_OF_LIST } @@ -42,7 +46,7 @@ static void signal_handler(int signo) { char buffer[200 + 1]; snprintfz(buffer, 200, "\nSIGNAL HANLDER: received: %s. Oops! This is bad!\n", signals_waiting[i].name); if(write(STDERR_FILENO, buffer, strlen(buffer)) == -1) { - // nothing to do - we cannot write but there is no way to complaint about it + // nothing to do - we cannot write but there is no way to complain about it ; } } @@ -74,15 +78,33 @@ void signals_init(void) { struct sigaction sa; sa.sa_flags = 0; + // Enable process tracking / reaper if running as init (pid == 1). + // This prevents zombie processes when running in a container. + if (getpid() == 1) { + info("SIGNAL: Enabling reaper"); + myp_init(); + reaper_enabled = 1; + } else { + info("SIGNAL: Not enabling reaper"); + } + // ignore all signals while we run in a signal handler sigfillset(&sa.sa_mask); int i; for (i = 0; signals_waiting[i].action != NETDATA_SIGNAL_END_OF_LIST; i++) { - if(signals_waiting[i].action == NETDATA_SIGNAL_IGNORE) + switch (signals_waiting[i].action) { + case NETDATA_SIGNAL_IGNORE: sa.sa_handler = SIG_IGN; - else + break; + case NETDATA_SIGNAL_CHILD: + if (reaper_enabled == 0) + continue; + // FALLTHROUGH + default: sa.sa_handler = signal_handler; + break; + } if(sigaction(signals_waiting[i].signo, &sa, NULL) == -1) error("SIGNAL: Failed to change signal handler for: %s", signals_waiting[i].name); @@ -100,6 +122,76 @@ void signals_reset(void) { if(sigaction(signals_waiting[i].signo, &sa, NULL) == -1) error("SIGNAL: Failed to reset signal handler for: %s", signals_waiting[i].name); } + + if (reaper_enabled == 1) + myp_free(); +} + +// reap_child reaps the child identified by pid. +static void reap_child(pid_t pid) { + siginfo_t i; + + errno = 0; + debug(D_CHILDS, "SIGNAL: Reaping pid: %d...", pid); + if (waitid(P_PID, (id_t)pid, &i, WEXITED|WNOHANG) == -1) { + if (errno != ECHILD) + error("SIGNAL: Failed to wait for: %d", pid); + else + debug(D_CHILDS, "SIGNAL: Already reaped: %d", pid); + return; + } else if (i.si_pid == 0) { + // Process didn't exit, this shouldn't happen. + return; + } + + switch (i.si_code) { + case CLD_EXITED: + debug(D_CHILDS, "SIGNAL: Child %d exited: %d", pid, i.si_status); + break; + case CLD_KILLED: + debug(D_CHILDS, "SIGNAL: Child %d killed by signal: %d", pid, i.si_status); + break; + case CLD_DUMPED: + debug(D_CHILDS, "SIGNAL: Child %d dumped core by signal: %d", pid, i.si_status); + break; + case CLD_STOPPED: + debug(D_CHILDS, "SIGNAL: Child %d stopped by signal: %d", pid, i.si_status); + break; + case CLD_TRAPPED: + debug(D_CHILDS, "SIGNAL: Child %d trapped by signal: %d", pid, i.si_status); + break; + case CLD_CONTINUED: + debug(D_CHILDS, "SIGNAL: Child %d continued by signal: %d", pid, i.si_status); + break; + default: + debug(D_CHILDS, "SIGNAL: Child %d gave us a SIGCHLD with code %d and status %d.", pid, i.si_code, i.si_status); + } +} + +// reap_children reaps all pending children which are not managed by myp. +static void reap_children() { + siginfo_t i; + + while (1 == 1) { + // Identify which process caused the signal so we can determine + // if we need to reap a re-parented process. + i.si_pid = 0; + if (waitid(P_ALL, (id_t)0, &i, WEXITED|WNOHANG|WNOWAIT) == -1) { + if (errno != ECHILD) // This shouldn't happen with WNOHANG but does. + error("SIGNAL: Failed to wait"); + return; + } else if (i.si_pid == 0) { + // No child exited. + return; + } else if (myp_reap(i.si_pid) == 0) { + // myp managed, sleep for a short time to avoid busy wait while + // this is handled by myp. + usleep(10000); + } else { + // Unknown process, likely a re-parented child, reap it. + reap_child(i.si_pid); + } + } } void signals_handle(void) { @@ -157,6 +249,11 @@ void signals_handle(void) { case NETDATA_SIGNAL_FATAL: fatal("SIGNAL: Received %s. netdata now exits.", name); + case NETDATA_SIGNAL_CHILD: + debug(D_CHILDS, "SIGNAL: Received %s. Reaping...", name); + reap_children(); + break; + default: info("SIGNAL: Received %s. No signal handler configured. Ignoring it.", name); break; |