summary | refs | log | tree | commit | diff | stats
path: root/daemon
diff options
context:
space:
mode:
Diffstat (limited to 'daemon')
-rw-r--r-- daemon/Makefile.am 2
-rw-r--r-- daemon/global_statistics.c 90
-rw-r--r-- daemon/main.c 36
-rw-r--r-- daemon/main.h 2
-rw-r--r-- daemon/signals.c 103
5 files changed, 194 insertions, 39 deletions
diff --git a/daemon/Makefile.am b/daemon/Makefile.am
index e020e517..ee1b53d0 100644
--- a/daemon/Makefile.am
+++ b/daemon/Makefile.am
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-3.0-or-later
AUTOMAKE_OPTIONS = subdir-objects
-MAINTAINERCLEANFILES= $(srcdir)/Makefile.in
+MAINTAINERCLEANFILES = $(srcdir)/Makefile.in
CLEANFILES = \
anonymous-statistics.sh \
$(NULL)
diff --git a/daemon/global_statistics.c b/daemon/global_statistics.c
index 53b7546f..2bcc5c9f 100644
--- a/daemon/global_statistics.c
+++ b/daemon/global_statistics.c
@@ -534,11 +534,30 @@ void global_statistics_charts(void) {
// ----------------------------------------------------------------
#ifdef ENABLE_DBENGINE
- if (localhost->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) {
- unsigned long long stats_array[RRDENG_NR_STATS];
+ RRDHOST *host;
+ unsigned long long stats_array[RRDENG_NR_STATS] = {0};
+ unsigned long long local_stats_array[RRDENG_NR_STATS];
+ unsigned hosts_with_dbengine = 0, i;
+
+ rrd_rdlock();
+ rrdhost_foreach_read(host) {
+ if (host->rrd_memory_mode == RRD_MEMORY_MODE_DBENGINE) {
+ ++hosts_with_dbengine;
+ /* get localhost's DB engine's statistics */
+ rrdeng_get_33_statistics(host->rrdeng_ctx, local_stats_array);
+ for (i = 0 ; i < RRDENG_NR_STATS ; ++i) {
+ /* aggregate statistics across hosts */
+ stats_array[i] += local_stats_array[i];
+ }
+ }
+ }
+ rrd_unlock();
- /* get localhost's DB engine's statistics */
- rrdeng_get_33_statistics(localhost->rrdeng_ctx, stats_array);
+ if (hosts_with_dbengine) {
+ /* deduplicate global statistics by getting the ones from the last host */
+ stats_array[30] = local_stats_array[30];
+ stats_array[31] = local_stats_array[31];
+ stats_array[32] = local_stats_array[32];
// ----------------------------------------------------------------
@@ -639,7 +658,7 @@ void global_statistics_charts(void) {
static RRDSET *st_pg_cache_pages = NULL;
static RRDDIM *rd_descriptors = NULL;
static RRDDIM *rd_populated = NULL;
- static RRDDIM *rd_commited = NULL;
+ static RRDDIM *rd_committed = NULL;
static RRDDIM *rd_insertions = NULL;
static RRDDIM *rd_deletions = NULL;
static RRDDIM *rd_backfills = NULL;
@@ -663,7 +682,7 @@ void global_statistics_charts(void) {
rd_descriptors = rrddim_add(st_pg_cache_pages, "descriptors", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
rd_populated = rrddim_add(st_pg_cache_pages, "populated", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
- rd_commited = rrddim_add(st_pg_cache_pages, "commited", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+ rd_committed = rrddim_add(st_pg_cache_pages, "committed", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
rd_insertions = rrddim_add(st_pg_cache_pages, "insertions", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
rd_deletions = rrddim_add(st_pg_cache_pages, "deletions", NULL, -1, 1, RRD_ALGORITHM_INCREMENTAL);
rd_backfills = rrddim_add(st_pg_cache_pages, "backfills", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
@@ -674,7 +693,7 @@ void global_statistics_charts(void) {
rrddim_set_by_pointer(st_pg_cache_pages, rd_descriptors, (collected_number)stats_array[27]);
rrddim_set_by_pointer(st_pg_cache_pages, rd_populated, (collected_number)stats_array[3]);
- rrddim_set_by_pointer(st_pg_cache_pages, rd_commited, (collected_number)stats_array[4]);
+ rrddim_set_by_pointer(st_pg_cache_pages, rd_committed, (collected_number)stats_array[4]);
rrddim_set_by_pointer(st_pg_cache_pages, rd_insertions, (collected_number)stats_array[5]);
rrddim_set_by_pointer(st_pg_cache_pages, rd_deletions, (collected_number)stats_array[6]);
rrddim_set_by_pointer(st_pg_cache_pages, rd_backfills, (collected_number)stats_array[9]);
@@ -818,6 +837,63 @@ void global_statistics_charts(void) {
rrddim_set_by_pointer(st_fd, rd_fd_max, (collected_number)rlimit_nofile.rlim_cur / 4);
rrdset_done(st_fd);
}
+
+ // ----------------------------------------------------------------
+
+ {
+ static RRDSET *st_ram_usage = NULL;
+ static RRDDIM *rd_cached = NULL;
+ static RRDDIM *rd_pinned = NULL;
+ static RRDDIM *rd_metadata = NULL;
+
+ collected_number cached_pages, pinned_pages, API_producers, populated_pages, metadata, pages_on_disk,
+ page_cache_descriptors;
+
+ if (unlikely(!st_ram_usage)) {
+ st_ram_usage = rrdset_create_localhost(
+ "netdata"
+ , "dbengine_ram"
+ , NULL
+ , "dbengine"
+ , NULL
+ , "NetData DB engine RAM usage"
+ , "MiB"
+ , "netdata"
+ , "stats"
+ , 130509
+ , localhost->rrd_update_every
+ , RRDSET_TYPE_STACKED
+ );
+
+ rd_cached = rrddim_add(st_ram_usage, "cache", NULL, 1, 256, RRD_ALGORITHM_ABSOLUTE);
+ rd_pinned = rrddim_add(st_ram_usage, "collectors", NULL, 1, 256, RRD_ALGORITHM_ABSOLUTE);
+ rd_metadata = rrddim_add(st_ram_usage, "metadata", NULL, 1, 1048576, RRD_ALGORITHM_ABSOLUTE);
+ }
+ else
+ rrdset_next(st_ram_usage);
+
+ API_producers = (collected_number)stats_array[0];
+ pages_on_disk = (collected_number)stats_array[2];
+ populated_pages = (collected_number)stats_array[3];
+ page_cache_descriptors = (collected_number)stats_array[27];
+
+ if (API_producers * 2 > populated_pages) {
+ pinned_pages = API_producers;
+ } else{
+ pinned_pages = API_producers * 2;
+ }
+ cached_pages = populated_pages - pinned_pages;
+
+ metadata = page_cache_descriptors * sizeof(struct page_cache_descr);
+ metadata += pages_on_disk * sizeof(struct rrdeng_page_descr);
+ /* This is an empirical estimation for Judy array indexing and extent structures */
+ metadata += pages_on_disk * 58;
+
+ rrddim_set_by_pointer(st_ram_usage, rd_cached, cached_pages);
+ rrddim_set_by_pointer(st_ram_usage, rd_pinned, pinned_pages);
+ rrddim_set_by_pointer(st_ram_usage, rd_metadata, metadata);
+ rrdset_done(st_ram_usage);
+ }
}
#endif
diff --git a/daemon/main.c b/daemon/main.c
index 4189ac7b..0e56654d 100644
--- a/daemon/main.c
+++ b/daemon/main.c
@@ -146,46 +146,28 @@ void web_server_config_options(void) {
}
-int killpid(pid_t pid, int signal)
-{
- int ret = -1;
+// killpid kills pid with SIGTERM.
+int killpid(pid_t pid) {
+ int ret;
debug(D_EXIT, "Request to kill pid %d", pid);
errno = 0;
- if(kill(pid, 0) == -1) {
+ ret = kill(pid, SIGTERM);
+ if (ret == -1) {
switch(errno) {
case ESRCH:
- error("Request to kill pid %d, but it is not running.", pid);
- break;
+ // We wanted the process to exit so just let the caller handle.
+ return ret;
case EPERM:
- error("Request to kill pid %d, but I do not have enough permissions.", pid);
+ error("Cannot kill pid %d, but I do not have enough permissions.", pid);
break;
default:
- error("Request to kill pid %d, but I received an error.", pid);
+ error("Cannot kill pid %d, but I received an error.", pid);
break;
}
}
- else {
- errno = 0;
- ret = kill(pid, signal);
- if(ret == -1) {
- switch(errno) {
- case ESRCH:
- error("Cannot kill pid %d, but it is not running.", pid);
- break;
-
- case EPERM:
- error("Cannot kill pid %d, but I do not have enough permissions.", pid);
- break;
-
- default:
- error("Cannot kill pid %d, but I received an error.", pid);
- break;
- }
- }
- }
return ret;
}
diff --git a/daemon/main.h b/daemon/main.h
index 68715598..9d9f4ef0 100644
--- a/daemon/main.h
+++ b/daemon/main.h
@@ -41,7 +41,7 @@ struct netdata_static_thread {
};
extern void cancel_main_threads(void);
-extern int killpid(pid_t pid, int signal);
+extern int killpid(pid_t pid);
extern void netdata_cleanup_and_exit(int ret) NORETURN;
extern void send_statistics(const char *action, const char *action_result, const char *action_data);
diff --git a/daemon/signals.c b/daemon/signals.c
index 71f27188..5378b04e 100644
--- a/daemon/signals.c
+++ b/daemon/signals.c
@@ -2,6 +2,8 @@
#include "common.h"
+static int reaper_enabled = 0;
+
typedef enum signal_action {
NETDATA_SIGNAL_END_OF_LIST,
NETDATA_SIGNAL_IGNORE,
@@ -10,6 +12,7 @@ typedef enum signal_action {
NETDATA_SIGNAL_LOG_ROTATE,
NETDATA_SIGNAL_RELOAD_HEALTH,
NETDATA_SIGNAL_FATAL,
+ NETDATA_SIGNAL_CHILD,
} SIGNAL_ACTION;
static struct {
@@ -26,6 +29,7 @@ static struct {
{ SIGUSR1, "SIGUSR1", 0, NETDATA_SIGNAL_SAVE_DATABASE },
{ SIGUSR2, "SIGUSR2", 0, NETDATA_SIGNAL_RELOAD_HEALTH },
{ SIGBUS, "SIGBUS", 0, NETDATA_SIGNAL_FATAL },
+ { SIGCHLD, "SIGCHLD", 0, NETDATA_SIGNAL_CHILD },
// terminator
{ 0, "NONE", 0, NETDATA_SIGNAL_END_OF_LIST }
@@ -42,7 +46,7 @@ static void signal_handler(int signo) {
char buffer[200 + 1];
snprintfz(buffer, 200, "\nSIGNAL HANLDER: received: %s. Oops! This is bad!\n", signals_waiting[i].name);
if(write(STDERR_FILENO, buffer, strlen(buffer)) == -1) {
- // nothing to do - we cannot write but there is no way to complaint about it
+ // nothing to do - we cannot write but there is no way to complain about it
;
}
}
@@ -74,15 +78,33 @@ void signals_init(void) {
struct sigaction sa;
sa.sa_flags = 0;
+ // Enable process tracking / reaper if running as init (pid == 1).
+ // This prevents zombie processes when running in a container.
+ if (getpid() == 1) {
+ info("SIGNAL: Enabling reaper");
+ myp_init();
+ reaper_enabled = 1;
+ } else {
+ info("SIGNAL: Not enabling reaper");
+ }
+
// ignore all signals while we run in a signal handler
sigfillset(&sa.sa_mask);
int i;
for (i = 0; signals_waiting[i].action != NETDATA_SIGNAL_END_OF_LIST; i++) {
- if(signals_waiting[i].action == NETDATA_SIGNAL_IGNORE)
+ switch (signals_waiting[i].action) {
+ case NETDATA_SIGNAL_IGNORE:
sa.sa_handler = SIG_IGN;
- else
+ break;
+ case NETDATA_SIGNAL_CHILD:
+ if (reaper_enabled == 0)
+ continue;
+ // FALLTHROUGH
+ default:
sa.sa_handler = signal_handler;
+ break;
+ }
if(sigaction(signals_waiting[i].signo, &sa, NULL) == -1)
error("SIGNAL: Failed to change signal handler for: %s", signals_waiting[i].name);
@@ -100,6 +122,76 @@ void signals_reset(void) {
if(sigaction(signals_waiting[i].signo, &sa, NULL) == -1)
error("SIGNAL: Failed to reset signal handler for: %s", signals_waiting[i].name);
}
+
+ if (reaper_enabled == 1)
+ myp_free();
+}
+
+// reap_child reaps the child identified by pid.
+static void reap_child(pid_t pid) {
+ siginfo_t i;
+
+ errno = 0;
+ debug(D_CHILDS, "SIGNAL: Reaping pid: %d...", pid);
+ if (waitid(P_PID, (id_t)pid, &i, WEXITED|WNOHANG) == -1) {
+ if (errno != ECHILD)
+ error("SIGNAL: Failed to wait for: %d", pid);
+ else
+ debug(D_CHILDS, "SIGNAL: Already reaped: %d", pid);
+ return;
+ } else if (i.si_pid == 0) {
+ // Process didn't exit, this shouldn't happen.
+ return;
+ }
+
+ switch (i.si_code) {
+ case CLD_EXITED:
+ debug(D_CHILDS, "SIGNAL: Child %d exited: %d", pid, i.si_status);
+ break;
+ case CLD_KILLED:
+ debug(D_CHILDS, "SIGNAL: Child %d killed by signal: %d", pid, i.si_status);
+ break;
+ case CLD_DUMPED:
+ debug(D_CHILDS, "SIGNAL: Child %d dumped core by signal: %d", pid, i.si_status);
+ break;
+ case CLD_STOPPED:
+ debug(D_CHILDS, "SIGNAL: Child %d stopped by signal: %d", pid, i.si_status);
+ break;
+ case CLD_TRAPPED:
+ debug(D_CHILDS, "SIGNAL: Child %d trapped by signal: %d", pid, i.si_status);
+ break;
+ case CLD_CONTINUED:
+ debug(D_CHILDS, "SIGNAL: Child %d continued by signal: %d", pid, i.si_status);
+ break;
+ default:
+ debug(D_CHILDS, "SIGNAL: Child %d gave us a SIGCHLD with code %d and status %d.", pid, i.si_code, i.si_status);
+ }
+}
+
+// reap_children reaps all pending children which are not managed by myp.
+static void reap_children() {
+ siginfo_t i;
+
+ while (1 == 1) {
+ // Identify which process caused the signal so we can determine
+ // if we need to reap a re-parented process.
+ i.si_pid = 0;
+ if (waitid(P_ALL, (id_t)0, &i, WEXITED|WNOHANG|WNOWAIT) == -1) {
+ if (errno != ECHILD) // This shouldn't happen with WNOHANG but does.
+ error("SIGNAL: Failed to wait");
+ return;
+ } else if (i.si_pid == 0) {
+ // No child exited.
+ return;
+ } else if (myp_reap(i.si_pid) == 0) {
+ // myp managed, sleep for a short time to avoid busy wait while
+ // this is handled by myp.
+ usleep(10000);
+ } else {
+ // Unknown process, likely a re-parented child, reap it.
+ reap_child(i.si_pid);
+ }
+ }
}
void signals_handle(void) {
@@ -157,6 +249,11 @@ void signals_handle(void) {
case NETDATA_SIGNAL_FATAL:
fatal("SIGNAL: Received %s. netdata now exits.", name);
+ case NETDATA_SIGNAL_CHILD:
+ debug(D_CHILDS, "SIGNAL: Received %s. Reaping...", name);
+ reap_children();
+ break;
+
default:
info("SIGNAL: Received %s. No signal handler configured. Ignoring it.", name);
break;