summaryrefslogtreecommitdiffstats
path: root/health/health.c
diff options
context:
space:
mode:
Diffstat (limited to 'health/health.c')
-rw-r--r--health/health.c156
1 files changed, 99 insertions, 57 deletions
diff --git a/health/health.c b/health/health.c
index d49021ed0..c1a11167c 100644
--- a/health/health.c
+++ b/health/health.c
@@ -82,10 +82,13 @@ static bool prepare_command(BUFFER *wb,
const char *edit_command,
const char *machine_guid,
uuid_t *transition_id,
- const char *summary
+ const char *summary,
+ const char *context,
+ const char *component,
+ const char *type
) {
char buf[8192];
- size_t n = 8192 - 1;
+ size_t n = sizeof(buf) - 1;
buffer_strcat(wb, "exec");
@@ -195,6 +198,18 @@ static bool prepare_command(BUFFER *wb,
return false;
buffer_sprintf(wb, " '%s'", buf);
+ if (!sanitize_command_argument_string(buf, context, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, component, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, type, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
return true;
}
@@ -342,7 +357,9 @@ static void health_reload_host(RRDHOST *host) {
if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))
return;
- netdata_log_health("[%s]: Reloading health.", rrdhost_hostname(host));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Reloading health.",
+ rrdhost_hostname(host));
char *user_path = health_user_config_dir();
char *stock_path = health_stock_config_dir();
@@ -383,7 +400,7 @@ static void health_reload_host(RRDHOST *host) {
#ifdef ENABLE_ACLK
if (netdata_cloud_enabled) {
- struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
+ struct aclk_sync_cfg_t *wc = host->aclk_config;
if (likely(wc)) {
wc->alert_queue_removed = SEND_REMOVED_AFTER_HEALTH_LOOPS;
}
@@ -422,7 +439,7 @@ static inline int compare_active_alerts(const void * a, const void * b) {
active_alerts_t *active_alerts_a = (active_alerts_t *)a;
active_alerts_t *active_alerts_b = (active_alerts_t *)b;
- return ( active_alerts_b->last_status_change - active_alerts_a->last_status_change );
+ return (int) ( active_alerts_b->last_status_change - active_alerts_a->last_status_change );
}
static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
@@ -436,8 +453,10 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
// do not send notifications for disabled statuses
- netdata_log_debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
- netdata_log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
// mark it as run, so that we will send the same alarm if it happens again
goto done;
@@ -454,10 +473,10 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// we have executed this alarm notification in the past
if(last_executed_status == ae->new_status && !(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) {
// don't send the notification for the same status again
- netdata_log_debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_id(ae), ae_name(ae)
- , rrdcalc_status2string(ae->new_status));
- netdata_log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae)
- , rrdcalc_status2string(ae->new_status));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health not sending again notification for alarm '%s.%s' status %s",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae),
+ rrdcalc_status2string(ae->new_status));
goto done;
}
}
@@ -476,11 +495,16 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// Check if alarm notifications are silenced
if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
- netdata_log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health not sending notification for alarm '%s.%s' status %s "
+ "(command API has disabled notifications)",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
- netdata_log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Sending notification for alarm '%s.%s' status %s.",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health.health_default_exec);
const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health.health_default_recipient);
@@ -581,7 +605,11 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
edit_command,
host->machine_guid,
&ae->transition_id,
- host->health.use_summary_for_notifications && ae->summary?ae_summary(ae):ae_name(ae));
+ host->health.use_summary_for_notifications && ae->summary?ae_summary(ae):ae_name(ae),
+ string2str(ae->chart_context),
+ string2str(ae->component),
+ string2str(ae->type)
+ );
const char *command_to_run = buffer_tostring(wb);
if (ok) {
@@ -778,7 +806,8 @@ static void health_main_cleanup(void *ptr) {
netdata_log_info("cleaning up...");
static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
- netdata_log_health("Health thread ended.");
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "Health thread ended.");
}
static void initialize_health(RRDHOST *host)
@@ -790,7 +819,9 @@ static void initialize_health(RRDHOST *host)
rrdhost_flag_set(host, RRDHOST_FLAG_INITIALIZED_HEALTH);
- netdata_log_health("[%s]: Initializing health.", rrdhost_hostname(host));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Initializing health.",
+ rrdhost_hostname(host));
host->health.health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
host->health.health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
@@ -803,7 +834,11 @@ static void initialize_health(RRDHOST *host)
long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max);
if(n < 10) {
- netdata_log_health("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max);
+ nd_log(NDLS_DAEMON, NDLP_WARNING,
+ "Host '%s': health configuration has invalid max log entries %ld. "
+ "Using default %u",
+ rrdhost_hostname(host), n, host->health_log.max);
+
config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max);
}
else
@@ -811,7 +846,11 @@ static void initialize_health(RRDHOST *host)
uint32_t m = config_get_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_DEFAULT_HISTORY);
if (m < HEALTH_LOG_MINIMUM_HISTORY) {
- netdata_log_health("Host '%s': health configuration has invalid health log history %u. Using minimum %d", rrdhost_hostname(host), m, HEALTH_LOG_MINIMUM_HISTORY);
+ nd_log(NDLS_DAEMON, NDLP_WARNING,
+ "Host '%s': health configuration has invalid health log history %u. "
+ "Using minimum %d",
+ rrdhost_hostname(host), m, HEALTH_LOG_MINIMUM_HISTORY);
+
config_set_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_MINIMUM_HISTORY);
m = HEALTH_LOG_MINIMUM_HISTORY;
}
@@ -823,7 +862,9 @@ static void initialize_health(RRDHOST *host)
} else
host->health_log.health_log_history = m;
- netdata_log_health("[%s]: Health log history is set to %u seconds (%u days)", rrdhost_hostname(host), host->health_log.health_log_history, host->health_log.health_log_history / 86400);
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health log history is set to %u seconds (%u days)",
+ rrdhost_hostname(host), host->health_log.health_log_history, host->health_log.health_log_history / 86400);
conf_enabled_alarms = simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), NULL,
SIMPLE_PATTERN_EXACT, true);
@@ -871,10 +912,9 @@ static void health_sleep(time_t next_run, unsigned int loop __maybe_unused) {
}
}
-static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *silencers) {
+static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host)
+{
SILENCER *s;
- netdata_log_debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s",
- rrdcalc_name(rc), (rc->rrdset)?rrdset_context(rc->rrdset):"", rrdcalc_chart_name(rc), host);
for (s = silencers->silencers; s!=NULL; s=s->next){
if (
@@ -919,20 +959,20 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
if (silencers->stype == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
} else {
- SILENCE_TYPE st = check_silenced(rc, rrdhost_hostname(host), silencers);
+ SILENCE_TYPE st = check_silenced(rc, rrdhost_hostname(host));
if (st == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
}
if (rrdcalc_flags_old != rc->run_flags) {
- netdata_log_info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
- rrdhost_hostname(host),
- rrdcalc_name(rc),
- (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false",
- (rc->run_flags & RRDCALC_FLAG_DISABLED)?"true":"false",
- (rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false",
- (rc->run_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
- );
+ netdata_log_info(
+ "Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
+ rrdhost_hostname(host),
+ rrdcalc_name(rc),
+ (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED) ? "true" : "false",
+ (rc->run_flags & RRDCALC_FLAG_DISABLED) ? "true" : "false",
+ (rrdcalc_flags_old & RRDCALC_FLAG_SILENCED) ? "true" : "false",
+ (rc->run_flags & RRDCALC_FLAG_SILENCED) ? "true" : "false");
}
if (rc->run_flags & RRDCALC_FLAG_DISABLED)
return 1;
@@ -943,7 +983,7 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
static void sql_health_postpone_queue_removed(RRDHOST *host __maybe_unused) {
#ifdef ENABLE_ACLK
if (netdata_cloud_enabled) {
- struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
+ struct aclk_sync_cfg_t *wc = host->aclk_config;
if (unlikely(!wc)) {
return;
}
@@ -1049,7 +1089,7 @@ void *health_main(void *ptr) {
if (unlikely(check_if_resumed_from_suspension())) {
apply_hibernation_delay = 1;
- netdata_log_health(
+ nd_log(NDLS_DAEMON, NDLP_NOTICE,
"Postponing alarm checks for %"PRId64" seconds, "
"because it seems that the system was just resumed from suspension.",
(int64_t)hibernation_delay);
@@ -1058,8 +1098,9 @@ void *health_main(void *ptr) {
if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
static int logged=0;
if (!logged) {
- netdata_log_health("Skipping health checks, because all alarms are disabled via a %s command.",
- HEALTH_CMDAPI_CMD_DISABLEALL);
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "Skipping health checks, because all alarms are disabled via a %s command.",
+ HEALTH_CMDAPI_CMD_DISABLEALL);
logged = 1;
}
}
@@ -1081,7 +1122,7 @@ void *health_main(void *ptr) {
rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
if (unlikely(apply_hibernation_delay)) {
- netdata_log_health(
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
"[%s]: Postponing health checks for %"PRId64" seconds.",
rrdhost_hostname(host),
(int64_t)hibernation_delay);
@@ -1094,20 +1135,30 @@ void *health_main(void *ptr) {
continue;
}
- netdata_log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Resuming health checks after delay.",
+ rrdhost_hostname(host));
+
host->health.health_delay_up_to = 0;
}
// wait until cleanup of obsolete charts on children is complete
if (host != localhost) {
if (unlikely(host->trigger_chart_obsoletion_check == 1)) {
- netdata_log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host));
+
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Waiting for chart obsoletion check.",
+ rrdhost_hostname(host));
+
continue;
}
}
if (!health_running_logged) {
- netdata_log_health("[%s]: Health is running.", rrdhost_hostname(host));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health is running.",
+ rrdhost_hostname(host));
+
health_running_logged = true;
}
@@ -1161,6 +1212,7 @@ void *health_main(void *ptr) {
rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0);
if (ae) {
+ health_log_alert(host, ae);
health_alarm_log_add_entry(host, ae);
rc->old_status = rc->status;
rc->status = RRDCALC_STATUS_REMOVED;
@@ -1432,9 +1484,13 @@ void *health_main(void *ptr) {
)
);
+ health_log_alert(host, ae);
health_alarm_log_add_entry(host, ae);
- netdata_log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status));
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Alert event for [%s.%s], value [%s], status [%s].",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), ae_new_value_string(ae),
+ rrdcalc_status2string(ae->new_status));
rc->last_status_change_value = rc->value;
rc->last_status_change = now;
@@ -1519,6 +1575,7 @@ void *health_main(void *ptr) {
)
);
+ health_log_alert(host, ae);
ae->last_repeat = rc->last_repeat;
if (!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
ae->flags |= HEALTH_ENTRY_RUN_ONCE;
@@ -1554,10 +1611,9 @@ void *health_main(void *ptr) {
}
#ifdef ENABLE_ACLK
if (netdata_cloud_enabled) {
- struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
- if (unlikely(!wc)) {
+ struct aclk_sync_cfg_t *wc = host->aclk_config;
+ if (unlikely(!wc))
continue;
- }
if (wc->alert_queue_removed == 1) {
sql_queue_removed_alerts_to_aclk(host);
@@ -1594,17 +1650,3 @@ void *health_main(void *ptr) {
netdata_thread_cleanup_pop(1);
return NULL;
}
-
-void health_add_host_labels(void) {
- RRDLABELS *labels = localhost->rrdlabels;
-
- // The source should be CONF, but when it is set, these labels are exported by default ('send configured labels' in exporting.conf).
- // Their export seems to break exporting to Graphite, see https://github.com/netdata/netdata/issues/14084.
-
- int is_ephemeral = appconfig_get_boolean(&netdata_config, CONFIG_SECTION_HEALTH, "is ephemeral", CONFIG_BOOLEAN_NO);
- rrdlabels_add(labels, "_is_ephemeral", is_ephemeral ? "true" : "false", RRDLABEL_SRC_AUTO);
-
- int has_unstable_connection = appconfig_get_boolean(&netdata_config, CONFIG_SECTION_HEALTH, "has unstable connection", CONFIG_BOOLEAN_NO);
- rrdlabels_add(labels, "_has_unstable_connection", has_unstable_connection ? "true" : "false", RRDLABEL_SRC_AUTO);
-}
-