summaryrefslogtreecommitdiffstats
path: root/health/health.c
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2022-11-30 18:47:05 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2022-11-30 18:47:05 +0000
commit: 97e01009d69b8fbebfebf68f51e3d126d0ed43fc (patch)
tree: 02e8b836c3a9d89806f3e67d4a5fe9f52dbb0061 /health/health.c
parent: Releasing debian version 1.36.1-1. (diff)
download: netdata-97e01009d69b8fbebfebf68f51e3d126d0ed43fc.tar.xz
netdata-97e01009d69b8fbebfebf68f51e3d126d0ed43fc.zip
Merging upstream version 1.37.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health/health.c')
-rw-r--r-- health/health.c 1431
1 files changed, 882 insertions, 549 deletions
diff --git a/health/health.c b/health/health.c
index 9eb36a9c6..3784e0f31 100644
--- a/health/health.c
+++ b/health/health.c
@@ -2,11 +2,166 @@
#include "health.h"
+#define WORKER_HEALTH_JOB_RRD_LOCK 0 // job type ids used by this file, numbered consecutively from 0 to 9
+#define WORKER_HEALTH_JOB_HOST_LOCK 1
+#define WORKER_HEALTH_JOB_DB_QUERY 2
+#define WORKER_HEALTH_JOB_CALC_EVAL 3
+#define WORKER_HEALTH_JOB_WARNING_EVAL 4
+#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
+#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
+#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
+#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET 8
+#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM 9
+
+#if WORKER_UTILIZATION_MAX_JOB_TYPES < 10 // ten ids (0-9) are defined above, so the build stops when fewer than ten are allowed
+#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 10
+#endif
+/**
+ * Assemble the notification command line in wb.
+ *
+ * Writes the word exec first, then one argument per parameter in declaration
+ * order (exec through machine_guid). Every argument is preceded by a space
+ * and enclosed in single quotes. String parameters go through
+ * sanitize_command_argument_string() into a local scratch buffer before they
+ * are written; integer and floating point parameters are formatted directly
+ * with buffer_sprintf().
+ *
+ * @param wb destination buffer, written with buffer_strcat() and buffer_sprintf().
+ * @param exec first quoted argument after the exec keyword.
+ * @param when formatted with %u as a 32 bit value.
+ *             NOTE(review): a wider timestamp passed by a caller is narrowed
+ *             to uint32_t here - confirm this is acceptable.
+ * @param new_value formatted with NETDATA_DOUBLE_FORMAT_ZERO, as is old_value.
+ * @param n_warn formatted with %d, as is n_crit.
+ * All other const char * parameters are sanitized and quoted as described above.
+ *
+ * @return true after the last argument (machine_guid) has been written;
+ *         false as soon as sanitize_command_argument_string() fails for any
+ *         string parameter. Nothing in this function clears wb, so on false
+ *         the arguments written before the failing one are still in it.
+ */
+static bool prepare_command(BUFFER *wb,
+ const char *exec,
+ const char *recipient,
+ const char *registry_hostname,
+ uint32_t unique_id,
+ uint32_t alarm_id,
+ uint32_t alarm_event_id,
+ uint32_t when,
+ const char *alert_name,
+ const char *alert_chart_name,
+ const char *alert_family,
+ const char *new_status,
+ const char *old_status,
+ NETDATA_DOUBLE new_value,
+ NETDATA_DOUBLE old_value,
+ const char *alert_source,
+ uint32_t duration,
+ uint32_t non_clear_duration,
+ const char *alert_units,
+ const char *alert_info,
+ const char *new_value_string,
+ const char *old_value_string,
+ const char *source,
+ const char *error_msg,
+ int n_warn,
+ int n_crit,
+ const char *warn_alarms,
+ const char *crit_alarms,
+ const char *classification,
+ const char *edit_command,
+ const char *machine_guid)
+{
+ char buf[8192]; // scratch buffer reused for every sanitized string argument
+ size_t n = 8192 - 1; // size passed to the sanitizer: one byte less than buf
+
+ buffer_strcat(wb, "exec");
+
+ if (!sanitize_command_argument_string(buf, exec, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, recipient, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, registry_hostname, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ buffer_sprintf(wb, " '%u'", unique_id);
+
+ buffer_sprintf(wb, " '%u'", alarm_id);
+
+ buffer_sprintf(wb, " '%u'", alarm_event_id);
+
+ buffer_sprintf(wb, " '%u'", when);
+
+ if (!sanitize_command_argument_string(buf, alert_name, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, alert_chart_name, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, alert_family, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, new_status, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, old_status, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", new_value);
+
+ buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", old_value);
+
+ if (!sanitize_command_argument_string(buf, alert_source, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ buffer_sprintf(wb, " '%u'", duration);
+
+ buffer_sprintf(wb, " '%u'", non_clear_duration);
+
+ if (!sanitize_command_argument_string(buf, alert_units, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, alert_info, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, new_value_string, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, old_value_string, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, source, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, error_msg, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ buffer_sprintf(wb, " '%d'", n_warn);
+
+ buffer_sprintf(wb, " '%d'", n_crit);
+
+ if (!sanitize_command_argument_string(buf, warn_alarms, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, crit_alarms, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, classification, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, edit_command, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ if (!sanitize_command_argument_string(buf, machine_guid, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
+ return true;
+}
+
unsigned int default_health_enabled = 1;
char *silencers_filename;
// the queue of executed alarm notifications that haven't been waited for yet
-static struct {
+static __thread struct { // __thread: thread-local storage, so every thread has its own head/tail pair
ALARM_ENTRY *head; // oldest
ALARM_ENTRY *tail; // latest
} alarm_notifications_in_progress = {NULL, NULL};
@@ -146,77 +301,51 @@ void health_init(void) {
* @param host the structure of the host that the function will reload the configuration.
*/
static void health_reload_host(RRDHOST *host) {
- if(unlikely(!host->health_enabled))
+ if(unlikely(!host->health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) // skip only when health is disabled AND RRDHOST_FLAG_INITIALIZED_HEALTH is not set
return;
+ log_health("[%s]: Reloading health.", rrdhost_hostname(host));
+
char *user_path = health_user_config_dir();
char *stock_path = health_stock_config_dir();
// free all running alarms
- rrdhost_wrlock(host);
-
- while(host->templates)
- rrdcalctemplate_unlink_and_free(host, host->templates);
-
- RRDCALCTEMPLATE *rt,*next;
- for(rt = host->alarms_template_with_foreach; rt ; rt = next) {
- next = rt->next;
- rrdcalctemplate_free(rt);
- }
- host->alarms_template_with_foreach = NULL;
-
- while(host->alarms)
- rrdcalc_unlink_and_free(host, host->alarms);
-
- RRDCALC *rc,*nc;
- for(rc = host->alarms_with_foreach; rc ; rc = nc) {
- nc = rc->next;
- rrdcalc_free(rc);
- }
- host->alarms_with_foreach = NULL;
-
- rrdhost_unlock(host);
+ rrdcalc_delete_all(host); // stands in for the removed unlink-and-free loops over host alarms
+ rrdcalctemplate_delete_all(host); // stands in for the removed unlink-and-free loops over host templates
// invalidate all previous entries in the alarm log
+ netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); // NOTE(review): only a read lock is taken although t->flags is modified in the loop below - confirm
ALARM_ENTRY *t;
for(t = host->health_log.alarms ; t ; t = t->next) {
if(t->new_status != RRDCALC_STATUS_REMOVED)
t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
}
+ netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
- rrdhost_rdlock(host);
// reset all thresholds to all charts
RRDSET *st;
rrdset_foreach_read(st, host) {
st->green = NAN;
st->red = NAN;
}
- rrdhost_unlock(host);
+ rrdset_foreach_done(st); // closes the rrdset_foreach_read() loop above
// load the new alarms
- rrdhost_wrlock(host);
health_readdir(host, user_path, stock_path, NULL);
//Discard alarms with labels that do not apply to host
- rrdcalc_labels_unlink_alarm_from_host(host);
+ rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
// link the loaded alarms to their charts
- RRDDIM *rd;
rrdset_foreach_write(st, host) {
if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
continue;
- rrdsetcalc_link_matching(st);
- rrdcalctemplate_link_matching(st);
- //This loop must be the last, because ` rrdcalctemplate_link_matching` will create alarms related to it.
- rrdset_rdlock(st);
- rrddim_foreach_read(rd, st) {
- rrdcalc_link_to_rrddim(rd, st, host);
- }
- rrdset_unlock(st);
+ rrdcalc_link_matching_alerts_to_rrdset(st);
+ rrdcalctemplate_link_matching_templates_to_rrdset(st);
}
-
- rrdhost_unlock(host);
+ rrdset_foreach_done(st); // closes the rrdset_foreach_write() loop above
+ host->aclk_alert_reloaded = 1; // set on this host whenever the reload ran to the end
}
/**
@@ -234,11 +363,6 @@ void health_reload(void) {
health_reload_host(host);
rrd_unlock();
-#ifdef ENABLE_ACLK
- if (netdata_cloud_setting) {
- aclk_alert_reloaded = 1;
- }
-#endif
}
// ----------------------------------------------------------------------------
@@ -250,7 +374,6 @@ static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) {
return RRDCALC_STATUS_CLEAR;
}
-#define ALARM_EXEC_COMMAND_LENGTH 8192
#define ACTIVE_ALARMS_LIST_EXAMINE 500
#define ACTIVE_ALARMS_LIST 15
@@ -266,13 +389,14 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
// do not send notifications for internal statuses
- debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
+ debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
// do not send notifications for disabled statuses
- debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
+ debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
// mark it as run, so that we will send the same alarm if it happens again
goto done;
}
@@ -292,7 +416,9 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// we have executed this alarm notification in the past
if(t && t->new_status == ae->new_status) {
// don't send the notification for the same status again
- debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
+ debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae)
+ , rrdcalc_status2string(ae->new_status));
+ log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae)
, rrdcalc_status2string(ae->new_status));
goto done;
}
@@ -303,7 +429,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
- , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
+ , ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
}
@@ -312,14 +438,14 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// Check if alarm notifications are silenced
if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
- info("Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
+ log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
- static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
+ log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
- const char *exec = (ae->exec) ? ae->exec : host->health_default_exec;
- const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
+ const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health_default_exec);
+ const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health_default_recipient);
int n_warn=0, n_crit=0;
RRDCALC *rc;
@@ -330,13 +456,16 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
- for(rc = host->alarms; rc && (n_warn + n_crit) < ACTIVE_ALARMS_LIST_EXAMINE ; rc = rc->next) {
+ foreach_rrdcalc_in_rrdhost_read(host, rc) {
if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
continue;
+ if(unlikely((n_warn + n_crit) >= ACTIVE_ALARMS_LIST_EXAMINE))
+ break;
+
if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
- active_alerts[n_warn+n_crit].name = rc->name;
+ active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
active_alerts[n_warn+n_crit].status = rc->status;
n_warn++;
@@ -344,7 +473,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
expr = rc->warning;
} else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
- active_alerts[n_warn+n_crit].name = rc->name;
+ active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
active_alerts[n_warn+n_crit].status = rc->status;
n_crit++;
@@ -355,6 +484,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
expr = rc->warning;
}
}
+ foreach_rrdcalc_in_rrdhost_done(rc);
if (n_warn+n_crit>1)
qsort (active_alerts, n_warn+n_crit, sizeof(active_alerts_t), compare_active_alerts);
@@ -379,51 +509,55 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
}
}
- char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN");
-
- snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" NETDATA_DOUBLE_FORMAT_ZERO
- "' '" NETDATA_DOUBLE_FORMAT_ZERO
- "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
- exec,
- recipient,
- host->registry_hostname,
- ae->unique_id,
- ae->alarm_id,
- ae->alarm_event_id,
- (unsigned long)ae->when,
- ae->name,
- ae->chart?ae->chart:"NOCHART",
- ae->family?ae->family:"NOFAMILY",
- rrdcalc_status2string(ae->new_status),
- rrdcalc_status2string(ae->old_status),
- ae->new_value,
- ae->old_value,
- ae->source?ae->source:"UNKNOWN",
- (uint32_t)ae->duration,
- (uint32_t)ae->non_clear_duration,
- ae->units?ae->units:"",
- ae->info?ae->info:"",
- ae->new_value_string,
- ae->old_value_string,
- (expr && expr->source)?expr->source:"NOSOURCE",
- (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
- n_warn,
- n_crit,
- buffer_tostring(warn_alarms),
- buffer_tostring(crit_alarms),
- ae->classification?ae->classification:"Unknown",
- edit_command,
- host != localhost ? host->machine_guid:""
- );
-
- ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
- ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */
-
- debug(D_HEALTH, "executing command '%s'", command_to_run);
- ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
- ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
- enqueue_alarm_notify_in_progress(ae);
+ char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN");
+
+ BUFFER *wb = buffer_create(8192);
+ bool ok = prepare_command(wb,
+ exec,
+ recipient,
+ rrdhost_registry_hostname(host),
+ ae->unique_id,
+ ae->alarm_id,
+ ae->alarm_event_id,
+ (unsigned long)ae->when,
+ ae_name(ae),
+ ae->chart?ae_chart_name(ae):"NOCHART",
+ ae->family?ae_family(ae):"NOFAMILY",
+ rrdcalc_status2string(ae->new_status),
+ rrdcalc_status2string(ae->old_status),
+ ae->new_value,
+ ae->old_value,
+ ae->source?ae_source(ae):"UNKNOWN",
+ (uint32_t)ae->duration,
+ (uint32_t)ae->non_clear_duration,
+ ae_units(ae),
+ ae_info(ae),
+ ae_new_value_string(ae),
+ ae_old_value_string(ae),
+ (expr && expr->source)?expr->source:"NOSOURCE",
+ (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
+ n_warn,
+ n_crit,
+ buffer_tostring(warn_alarms),
+ buffer_tostring(crit_alarms),
+ ae->classification?ae_classification(ae):"Unknown",
+ edit_command,
+ host != localhost ? host->machine_guid:"");
+
+ const char *command_to_run = buffer_tostring(wb);
+ if (ok) {
+ ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
+ ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */
+
+ debug(D_HEALTH, "executing command '%s'", command_to_run);
+ ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
+ ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
+ enqueue_alarm_notify_in_progress(ae);
+ } else {
+ error("Failed to format command arguments");
+ }
+ buffer_free(wb);
freez(edit_command);
buffer_free(warn_alarms);
buffer_free(crit_alarms);
@@ -450,7 +584,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
- ae->chart?ae->chart:"NOCHART", ae->name,
+ ae->chart?ae_chart_name(ae):"NOCHART", ae_name(ae),
ae->new_value,
rrdcalc_status2string(ae->old_status),
rrdcalc_status2string(ae->new_status)
@@ -467,7 +601,7 @@ static inline void health_alarm_log_process(RRDHOST *host) {
ALARM_ENTRY *ae;
for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
- if(likely(!alarm_entry_isrepeating(host, ae))) {
+ if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
if(unlikely(
!(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
!(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
@@ -481,13 +615,13 @@ static inline void health_alarm_log_process(RRDHOST *host) {
}
}
+ netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+
// remember this for the next iteration
host->health_last_processed_id = first_waiting;
bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max;
- netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
-
if (!cleanup_excess_log_entries)
return;
@@ -508,7 +642,7 @@ static inline void health_alarm_log_process(RRDHOST *host) {
ALARM_ENTRY *t = ae->next;
- if(likely(!alarm_entry_isrepeating(host, ae))) {
+ if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
health_alarm_wait_for_execution(ae);
health_alarm_log_free_one_nochecks_nounlink(ae);
host->health_log.count--;
@@ -522,7 +656,7 @@ static inline void health_alarm_log_process(RRDHOST *host) {
static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
if(unlikely(!rc->rrdset)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
@@ -533,40 +667,38 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
*next_run = rc->next_update;
}
- debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
+ debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now));
return 0;
}
if(unlikely(!rc->update_every)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rc->chart?rc->chart:"NOCHART", rc->name);
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name);
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
int update_every = rc->rrdset->update_every;
- rrdset_rdlock(rc->rrdset);
- time_t first = rrdset_first_entry_t_nolock(rc->rrdset);
- time_t last = rrdset_last_entry_t_nolock(rc->rrdset);
- rrdset_unlock(rc->rrdset);
+ time_t first = rrdset_first_entry_t(rc->rrdset);
+ time_t last = rrdset_last_entry_t(rc->rrdset);
if(unlikely(now + update_every < first /* || now - update_every > last */)) {
debug(D_HEALTH
, "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
- , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
+ , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) now, (unsigned long) first
, (unsigned long) last);
return 0;
}
@@ -577,7 +709,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
if(needed + update_every < first || needed - update_every > last) {
debug(D_HEALTH
, "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
- , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
+ , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) needed, (unsigned long) first
, (unsigned long) last);
return 0;
}
@@ -587,7 +719,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
}
static inline int check_if_resumed_from_suspension(void) {
- static usec_t last_realtime = 0, last_monotonic = 0;
+ static __thread usec_t last_realtime = 0, last_monotonic = 0;
usec_t realtime = now_realtime_usec(), monotonic = now_monotonic_usec();
int ret = 0;
@@ -603,41 +735,142 @@ static inline int check_if_resumed_from_suspension(void) {
return ret;
}
-static void health_main_cleanup(void *ptr) {
+static void health_thread_cleanup(void *ptr) { // ptr is used as a struct health_state *
worker_unregister();
- struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
- static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
+ struct health_state *h = ptr;
+ h->host->health_spawn = 0; // NOTE(review): presumably marks that this host has no health thread anymore - confirm
+
+ netdata_thread_cancel(netdata_thread_self()); // NOTE(review): the handler cancels the thread it is running on - confirm this is intended
+ log_health("[%s]: Health thread ended.", rrdhost_hostname(h->host));
+ debug(D_HEALTH, "HEALTH %s: Health thread ended.", rrdhost_hostname(h->host));
+}
+/**
+ * One-time health setup for a host.
+ *
+ * Does nothing when host->health_enabled is zero or when
+ * RRDHOST_FLAG_INITIALIZED_HEALTH is already set; otherwise it sets that
+ * flag and then, in this order: reads the repeat defaults and the in-memory
+ * log size from the health configuration section, initializes the alarm log
+ * rwlock, creates the health directory under host->varlib_dir, stores the
+ * health log file name, the default notification script and the default
+ * recipient (root), runs the alarm log loading branch, calls
+ * health_readdir(), links alerts and templates to every chart that is not
+ * flagged RRDSET_FLAG_ARCHIVED, discards alerts that do not match the host
+ * labels and finally calls health_silencers_init().
+ *
+ * @param host the host to initialize.
+ * @param is_localhost when zero, host->varlib_dir itself is created before the health directory.
+ */
+static void initialize_health(RRDHOST *host, int is_localhost) {
+ if(!host->health_enabled || rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) return;
+ rrdhost_flag_set(host, RRDHOST_FLAG_INITIALIZED_HEALTH);
+
+ log_health("[%s]: Initializing health.", rrdhost_hostname(host));
+
+ host->health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
+ host->health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
+
+ host->health_log.next_log_id = 1; // NOTE(review): overwritten below by the realtime based value - looks like a dead store
+ host->health_log.next_alarm_id = 1; // NOTE(review): overwritten below with 0 - looks like a dead store
+ host->health_log.max = 1000;
+ host->health_log.next_log_id = (uint32_t)now_realtime_sec();
+ host->health_log.next_alarm_id = 0;
+
+ long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max);
+ if(n < 10) { // values below 10 are rejected: max keeps its default and that default is written back to the config
+ error("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max);
+ config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max);
+ }
+ else
+ host->health_log.max = (unsigned int)n;
+
+ netdata_rwlock_init(&host->health_log.alarm_log_rwlock);
+
+ char filename[FILENAME_MAX + 1];
+
+ if(!is_localhost) {
+ int r = mkdir(host->varlib_dir, 0775);
+ if (r != 0 && errno != EEXIST) // an already existing directory is not an error
+ error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), host->varlib_dir);
+ }
+
+ {
+ snprintfz(filename, FILENAME_MAX, "%s/health", host->varlib_dir);
+ int r = mkdir(filename, 0775);
+ if(r != 0 && errno != EEXIST)
+ error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), filename);
+ }
+ snprintfz(filename, FILENAME_MAX, "%s/health/health-log.db", host->varlib_dir);
+ host->health_log_filename = strdupz(filename);
+
+ snprintfz(filename, FILENAME_MAX, "%s/alarm-notify.sh", netdata_configured_primary_plugins_dir);
+ host->health_default_exec = string_strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename)); // filename is only the default; the configured value wins
+ host->health_default_recipient = string_strdupz("root");
+
+ if (!file_is_migrated(host->health_log_filename)) {
+ int rc = sql_create_health_log_table(host);
+ if (unlikely(rc)) { // table creation failed: health_alarm_log_load() is followed by health_alarm_log_open()
+ log_health("[%s]: Failed to create health log table in the database", rrdhost_hostname(host));
+ health_alarm_log_load(host);
+ health_alarm_log_open(host);
+ }
+ else {
+ health_alarm_log_load(host);
+ add_migrated_file(host->health_log_filename, 0);
+ }
+ } else {
+ // TODO: This needs to go to the metadata thread
+ // Health should wait before accessing the table (needs to be created by the metadata thread)
+ sql_create_health_log_table(host);
+ sql_health_alarm_log_load(host);
+ }
+
+ // ------------------------------------------------------------------------
+ // load health configuration
+
+ health_readdir(host, health_user_config_dir(), health_stock_config_dir(), NULL);
- info("cleaning up...");
+ // link the loaded alarms to their charts
+ RRDSET *st;
+ rrdset_foreach_write(st, host) {
+ if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
+ continue;
+
+ rrdcalc_link_matching_alerts_to_rrdset(st);
+ rrdcalctemplate_link_matching_templates_to_rrdset(st);
+ }
+ rrdset_foreach_done(st);
- static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
+ //Discard alarms with labels that do not apply to host
+ rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
+
+ health_silencers_init();
+}
+/**
+ * Wait until next_run before the next health iteration.
+ *
+ * When the current realtime second is already at or past next_run, only a
+ * debug message is emitted. Otherwise the worker is marked idle and the
+ * function sleeps in steps of USEC_PER_SEC, re-reading the clock after each
+ * step, until next_run is reached, host->health_enabled becomes zero or
+ * netdata_exit becomes non-zero.
+ *
+ * @param next_run realtime second at which the wait ends.
+ * @param loop iteration number, used only in the debug messages.
+ * @param host host whose health_enabled field is polled during the wait.
+ */
+static void health_sleep(time_t next_run, unsigned int loop __maybe_unused, RRDHOST *host) {
+ time_t now = now_realtime_sec();
+ if(now < next_run) {
+ worker_is_idle();
+ debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
+ while (now < next_run && host->health_enabled && !netdata_exit) { // short steps so that disabling health or shutting down ends the wait early
+ sleep_usec(USEC_PER_SEC);
+ now = now_realtime_sec();
+ }
+ }
+ else {
+ debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
+ }
}
-static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
+static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *silencers) {
SILENCER *s;
debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s",
- rc->name, (rc->rrdset)?rc->rrdset->context:"", rc->chart, host, (rc->rrdset)?rc->rrdset->family:"");
+ rrdcalc_name(rc), (rc->rrdset)?rrdset_context(rc->rrdset):"", rrdcalc_chart_name(rc), host, (rc->rrdset)?rrdset_family(rc->rrdset):"");
for (s = silencers->silencers; s!=NULL; s=s->next){
if (
- (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern,rc->name))) &&
- (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern,rc->rrdset->context))) &&
+ (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern, rrdcalc_name(rc)))) &&
+ (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern, rrdset_context(rc->rrdset)))) &&
(!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) &&
- (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern,rc->chart))) &&
- (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern,rc->rrdset->family)))
+ (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern, rrdcalc_chart_name(rc)))) &&
+ (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern, rrdset_family(rc->rrdset))))
) {
debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families);
if (unlikely(silencers->stype == STYPE_NONE)) {
- debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name);
+ debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc));
} else {
debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
, (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
- , rc->name
- , (rc->rrdset)?rc->rrdset->context:""
- , rc->chart
+ , rrdcalc_name(rc)
+ , (rc->rrdset)?rrdset_context(rc->rrdset):""
+ , rrdcalc_chart_name(rc)
, host
- , (rc->rrdset)?rc->rrdset->family:""
+ , (rc->rrdset)?rrdset_family(rc->rrdset):""
);
}
return silencers->stype;
@@ -657,66 +890,86 @@ static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers
* @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise
*/
static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
- uint32_t rrdcalc_flags_old = rc->rrdcalc_flags;
+ uint32_t rrdcalc_flags_old = rc->run_flags; // previous flags, kept so that a change can be detected and logged below
// Clear the flags
- rc->rrdcalc_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED);
+ rc->run_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED);
if (unlikely(silencers->all_alarms)) { // the global silencer type is applied directly, without calling check_silenced()
- if (silencers->stype == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
- else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
+ if (silencers->stype == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
+ else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
} else {
- SILENCE_TYPE st = check_silenced(rc, host->hostname, silencers);
- if (st == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
- else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
+ SILENCE_TYPE st = check_silenced(rc, rrdhost_hostname(host), silencers); // the result of check_silenced() decides which flag, if any, is set
+ if (st == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
+ else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
}
- if (rrdcalc_flags_old != rc->rrdcalc_flags) {
+ if (rrdcalc_flags_old != rc->run_flags) { // log only when the flag word differs from the saved copy
info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
- host->hostname,
- rc->name,
+ rrdhost_hostname(host),
+ rrdcalc_name(rc),
(rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false",
- (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false",
+ (rc->run_flags & RRDCALC_FLAG_DISABLED)?"true":"false",
(rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false",
- (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
+ (rc->run_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
);
}
- if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)
+ if (rc->run_flags & RRDCALC_FLAG_DISABLED) // 1 only for DISABLED; SILENCED alone returns 0
return 1;
else
return 0;
}
-// Create alarms for dimensions that have been added to charts
-// since the previous iteration.
-static void init_pending_foreach_alarms(RRDHOST *host) {
+static void health_execute_delayed_initializations(RRDHOST *host) { // runs the health setup for the charts and dimensions of this host that carry a PENDING_HEALTH_INITIALIZATION flag
RRDSET *st;
- RRDDIM *rd;
- if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS))
- return;
+ if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION)) return; // host flag not set: nothing to do
+ rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION); // the host flag is cleared before the charts are walked, not after
- rrdhost_wrlock(host);
+ rrdset_foreach_reentrant(st, host) {
+ if(!rrdset_flag_check(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION)) continue; // charts without the pending flag are skipped, together with all their dimensions
+ rrdset_flag_clear(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION);
- rrdset_foreach_write(st, host) {
- if (!rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS))
- continue;
+ worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET);
+
+ if(!st->rrdfamily) // set only when the chart has none yet
+ st->rrdfamily = rrdfamily_add_and_acquire(host, rrdset_family(st));
+
+ if(!st->rrdvars) // set only when the chart has none yet
+ st->rrdvars = rrdvariables_create();
+
+ rrddimvar_index_init(st);
- rrdset_rdlock(st);
+ rrdsetvar_add_and_leave_released(st, "last_collected_t", RRDVAR_TYPE_TIME_T, &st->last_collected_time.tv_sec, RRDVAR_FLAG_NONE); // the four chart variables registered here point directly at fields of st
+ rrdsetvar_add_and_leave_released(st, "green", RRDVAR_TYPE_CALCULATED, &st->green, RRDVAR_FLAG_NONE);
+ rrdsetvar_add_and_leave_released(st, "red", RRDVAR_TYPE_CALCULATED, &st->red, RRDVAR_FLAG_NONE);
+ rrdsetvar_add_and_leave_released(st, "update_every", RRDVAR_TYPE_INT, &st->update_every, RRDVAR_FLAG_NONE);
+ rrdcalc_link_matching_alerts_to_rrdset(st); // NOTE(review): presumably attaches the already-defined alerts that match this chart - confirm against rrdcalc.c
+ rrdcalctemplate_link_matching_templates_to_rrdset(st); // NOTE(review): presumably instantiates matching templates for this chart - confirm against rrdcalctemplate.c
+
+ RRDDIM *rd;
rrddim_foreach_read(rd, st) {
- if (!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM))
- continue;
+ if(!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION)) continue; // only dimensions flagged as pending are initialized
+ rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION);
- rrdcalc_link_to_rrddim(rd, st, host);
+ worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM);
- rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM);
- }
+ rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_CALCULATED, NULL, NULL, &rd->last_stored_value, RRDVAR_FLAG_NONE); // three per-dimension variables: stored value (no suffix), "_raw" collected value, "_last_collected_t" timestamp
+ rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_COLLECTED, NULL, "_raw", &rd->last_collected_value, RRDVAR_FLAG_NONE);
+ rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_TIME_T, NULL, "_last_collected_t", &rd->last_collected_time.tv_sec, RRDVAR_FLAG_NONE);
- rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS);
- rrdset_unlock(st);
- }
+ RRDCALCTEMPLATE *rt;
+ foreach_rrdcalctemplate_read(host, rt) {
+ if(!rt->foreach_dimension_pattern) // templates without a foreach-dimension pattern are ignored in this loop
+ continue;
- rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS);
- rrdhost_unlock(host);
+ if(rrdcalctemplate_check_rrdset_conditions(rt, st, host)) // the dimension-level check and link runs only when the chart-level conditions pass
+ rrdcalctemplate_check_rrddim_conditions_and_link(rt, st, rd, host);
+ }
+ foreach_rrdcalctemplate_done(rt);
+ }
+ rrddim_foreach_done(rd);
+ }
+ rrdset_foreach_done(st);
}
/**
@@ -729,19 +982,6 @@ static void init_pending_foreach_alarms(RRDHOST *host) {
* @return It always returns NULL
*/
-#define WORKER_HEALTH_JOB_RRD_LOCK 0
-#define WORKER_HEALTH_JOB_HOST_LOCK 1
-#define WORKER_HEALTH_JOB_DB_QUERY 2
-#define WORKER_HEALTH_JOB_CALC_EVAL 3
-#define WORKER_HEALTH_JOB_WARNING_EVAL 4
-#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
-#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
-#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
-
-#if WORKER_UTILIZATION_MAX_JOB_TYPES < 8
-#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8
-#endif
-
void *health_main(void *ptr) {
worker_register("HEALTH");
worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock");
@@ -752,8 +992,14 @@ void *health_main(void *ptr) {
worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval");
worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry");
worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process");
+ worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET, "rrdset init");
+ worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM, "rrddim init");
- netdata_thread_cleanup_push(health_main_cleanup, ptr);
+ struct health_state *h = ptr;
+ netdata_thread_cleanup_push(health_thread_cleanup, ptr);
+
+ RRDHOST *host = h->host;
+ initialize_health(host, host == localhost);
int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
if(min_run_every < 1) min_run_every = 1;
@@ -763,16 +1009,21 @@ void *health_main(void *ptr) {
time_t now = now_realtime_sec();
time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
- rrdcalc_labels_unlink();
+ bool health_running_logged = false;
+
+ rrdhost_rdlock(host); //CHECK
+ rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
+ rrdhost_unlock(host);
unsigned int loop = 0;
#ifdef ENABLE_ACLK
unsigned int marked_aclk_reload_loop = 0;
#endif
- while(!netdata_exit) {
+ while(!netdata_exit && host->health_enabled) {
loop++;
debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
+ now = now_realtime_sec();
int runnable = 0, apply_hibernation_delay = 0;
time_t next_run = now + min_run_every;
RRDCALC *rc;
@@ -780,433 +1031,500 @@ void *health_main(void *ptr) {
if (unlikely(check_if_resumed_from_suspension())) {
apply_hibernation_delay = 1;
- info(
- "Postponing alarm checks for %"PRId64" seconds, "
- "because it seems that the system was just resumed from suspension.",
- (int64_t)hibernation_delay);
+ log_health(
+ "[%s]: Postponing alarm checks for %"PRId64" seconds, "
+ "because it seems that the system was just resumed from suspension.",
+ rrdhost_hostname(host),
+ (int64_t)hibernation_delay);
}
if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
- static int logged=0;
+ static __thread int logged=0;
if (!logged) {
- info("Skipping health checks, because all alarms are disabled via a %s command.",
- HEALTH_CMDAPI_CMD_DISABLEALL);
+ log_health("[%s]: Skipping health checks, because all alarms are disabled via a %s command.",
+ rrdhost_hostname(host),
+ HEALTH_CMDAPI_CMD_DISABLEALL);
logged = 1;
}
}
#ifdef ENABLE_ACLK
- if (aclk_alert_reloaded && !marked_aclk_reload_loop)
+ if (host->aclk_alert_reloaded && !marked_aclk_reload_loop)
marked_aclk_reload_loop = loop;
#endif
- worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK);
- rrd_rdlock();
+ if (unlikely(apply_hibernation_delay)) {
+ log_health(
+ "[%s]: Postponing health checks for %"PRId64" seconds.",
+ rrdhost_hostname(host),
+ (int64_t)hibernation_delay);
- RRDHOST *host;
- rrdhost_foreach_read(host) {
- if (unlikely(!host->health_enabled))
+ host->health_delay_up_to = now + hibernation_delay;
+ next_run = now + hibernation_delay;
+ health_sleep(next_run, loop, host);
+ }
+
+ if (unlikely(host->health_delay_up_to)) {
+ if (unlikely(now < host->health_delay_up_to)) {
+ next_run = host->health_delay_up_to;
+ health_sleep(next_run, loop, host);
continue;
+ }
- if (unlikely(apply_hibernation_delay)) {
- info(
- "Postponing health checks for %"PRId64" seconds, on host '%s'.",
- (int64_t)hibernation_delay,
- host->hostname);
+ log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host));
+ host->health_delay_up_to = 0;
+ }
- host->health_delay_up_to = now + hibernation_delay;
+ // wait until cleanup of obsolete charts on children is complete
+ if (host != localhost) {
+ if (unlikely(host->trigger_chart_obsoletion_check == 1)) {
+ log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host));
+ health_sleep(next_run, loop, host);
+ continue;
}
+ }
- if (unlikely(host->health_delay_up_to)) {
- if (unlikely(now < host->health_delay_up_to))
- continue;
+ if (!health_running_logged) {
+ log_health("[%s]: Health is running.", rrdhost_hostname(host));
+ health_running_logged = true;
+ }
- info("Resuming health checks on host '%s'.", host->hostname);
- host->health_delay_up_to = 0;
- }
+ if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
+ sql_health_alarm_log_cleanup(host);
- // wait until cleanup of obsolete charts on children is complete
- if (host != localhost)
- if (unlikely(host->trigger_chart_obsoletion_check == 1))
- continue;
+ health_execute_delayed_initializations(host);
- if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
- sql_health_alarm_log_cleanup(host);
+ worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
- init_pending_foreach_alarms(host);
+ // the first loop is to lookup values from the db
+ foreach_rrdcalc_in_rrdhost_read(host, rc) {
- worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
- rrdhost_rdlock(host);
+ rrdcalc_update_info_using_rrdset_labels(rc);
- // the first loop is to lookup values from the db
- for (rc = host->alarms; rc; rc = rc->next) {
+ if (update_disabled_silenced(host, rc))
+ continue;
- if (update_disabled_silenced(host, rc))
- continue;
+ // create an alert removed event if the chart is obsolete and
+ // has stopped being collected for 60 seconds
+ if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED &&
+ rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
+ now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
+ if (!rrdcalc_isrepeating(rc)) {
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
+ time_t now = now_realtime_sec();
+
+ ALARM_ENTRY *ae = health_create_alarm_entry(
+ host,
+ rc->id,
+ rc->next_event_id++,
+ rc->config_hash_id,
+ now,
+ rc->name,
+ rc->rrdset->id,
+ rc->rrdset->context,
+ rc->rrdset->family,
+ rc->classification,
+ rc->component,
+ rc->type,
+ rc->exec,
+ rc->recipient,
+ now - rc->last_status_change,
+ rc->value,
+ NAN,
+ rc->status,
+ RRDCALC_STATUS_REMOVED,
+ rc->source,
+ rc->units,
+ rc->info,
+ 0,
+ rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0);
+
+ if (ae) {
+ health_alarm_log_add_entry(host, ae);
+ rc->old_status = rc->status;
+ rc->status = RRDCALC_STATUS_REMOVED;
+ rc->last_status_change = now;
+ rc->last_updated = now;
+ rc->value = NAN;
- // create an alert removed event if the chart is obsolete and
- // has stopped being collected for 60 seconds
- if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED &&
- rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
- now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
- if (!rrdcalc_isrepeating(rc)) {
- worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
- time_t now = now_realtime_sec();
- ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
- rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
- rc->value, NAN, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0, 0);
- if (ae) {
- health_alarm_log(host, ae);
- rc->old_status = rc->status;
- rc->status = RRDCALC_STATUS_REMOVED;
- rc->last_status_change = now;
- rc->last_updated = now;
- rc->value = NAN;
#ifdef ENABLE_ACLK
- if (netdata_cloud_setting && likely(!aclk_alert_reloaded))
- sql_queue_alarm_to_aclk(host, ae, 1);
+ if (netdata_cloud_setting && likely(!host->aclk_alert_reloaded))
+ sql_queue_alarm_to_aclk(host, ae, 1);
#endif
- }
}
}
+ }
- if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
- if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
- continue;
- }
-
- runnable++;
- rc->old_value = rc->value;
- rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
-
- // ------------------------------------------------------------
- // if there is database lookup, do it
-
- if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
- worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY);
-
- /* time_t old_db_timestamp = rc->db_before; */
- int value_is_null = 0;
+ if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
+ if (unlikely(rc->run_flags & RRDCALC_FLAG_RUNNABLE))
+ rc->run_flags &= ~RRDCALC_FLAG_RUNNABLE;
+ continue;
+ }
- int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1,
- rc->after, rc->before, rc->group, NULL,
- 0, rc->options,
- &rc->db_after,&rc->db_before,
- NULL, NULL, NULL,
- &value_is_null, NULL, 0, 0);
+ runnable++;
+ rc->old_value = rc->value;
+ rc->run_flags |= RRDCALC_FLAG_RUNNABLE;
+
+ // ------------------------------------------------------------
+ // if there is database lookup, do it
+
+ if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
+ worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY);
+
+ /* time_t old_db_timestamp = rc->db_before; */
+ int value_is_null = 0;
+
+ int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rrdcalc_dimensions(rc), 1,
+ rc->after, rc->before, rc->group, NULL,
+ 0, rc->options,
+ &rc->db_after,&rc->db_before,
+ NULL, NULL, NULL,
+ &value_is_null, NULL, 0, 0,
+ QUERY_SOURCE_HEALTH);
+
+ if (unlikely(ret != 200)) {
+ // database lookup failed
+ rc->value = NAN;
+ rc->run_flags |= RRDCALC_FLAG_DB_ERROR;
+
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), ret
+ );
+ } else
+ rc->run_flags &= ~RRDCALC_FLAG_DB_ERROR;
+
+ /* - RRDCALC_FLAG_DB_STALE not currently used
+ if (unlikely(old_db_timestamp == rc->db_before)) {
+ // database is stale
+
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
+
+ if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
+ rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
+ error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
+ }
+ }
+ else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
+ rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
+ */
+
+ if (unlikely(value_is_null)) {
+ // collected value is null
+ rc->value = NAN;
+ rc->run_flags |= RRDCALC_FLAG_DB_NAN;
+
+ debug(D_HEALTH,
+ "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc)
+ );
+ } else
+ rc->run_flags &= ~RRDCALC_FLAG_DB_NAN;
+
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT,
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->value
+ );
+ }
- if (unlikely(ret != 200)) {
- // database lookup failed
- rc->value = NAN;
- rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
+ // ------------------------------------------------------------
+ // if there is calculation expression, run it
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret
- );
- } else
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
+ if (unlikely(rc->calculation)) {
+ worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
- /* - RRDCALC_FLAG_DB_STALE not currently used
- if (unlikely(old_db_timestamp == rc->db_before)) {
- // database is stale
+ if (unlikely(!expression_evaluate(rc->calculation))) {
+ // calculation failed
+ rc->value = NAN;
+ rc->run_flags |= RRDCALC_FLAG_CALC_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
+ rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
+ );
+ } else {
+ rc->run_flags &= ~RRDCALC_FLAG_CALC_ERROR;
- if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
- rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
- error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
- }
- }
- else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
- */
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
+ NETDATA_DOUBLE_FORMAT
+ ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
+ rc->calculation->parsed_as, rc->calculation->result,
+ buffer_tostring(rc->calculation->error_msg), rrdcalc_source(rc)
+ );
- if (unlikely(value_is_null)) {
- // collected value is null
- rc->value = NAN;
- rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
+ rc->value = rc->calculation->result;
+ }
+ }
+ }
+ foreach_rrdcalc_in_rrdhost_done(rc);
- debug(D_HEALTH,
- "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
- );
- } else
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
+ if (unlikely(runnable && !netdata_exit)) {
+ foreach_rrdcalc_in_rrdhost_read(host, rc) {
+ if (unlikely(!(rc->run_flags & RRDCALC_FLAG_RUNNABLE)))
+ continue;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
- rc->value
- );
+ if (rc->run_flags & RRDCALC_FLAG_DISABLED) {
+ continue;
}
+ RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED;
+ RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED;
- // ------------------------------------------------------------
- // if there is calculation expression, run it
+ // --------------------------------------------------------
+ // check the warning expression
- if (unlikely(rc->calculation)) {
- worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
+ if (likely(rc->warning)) {
+ worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
- if (unlikely(!expression_evaluate(rc->calculation))) {
+ if (unlikely(!expression_evaluate(rc->warning))) {
// calculation failed
- rc->value = NAN;
- rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
+ rc->run_flags |= RRDCALC_FLAG_WARN_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
- rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
- );
+ debug(D_HEALTH,
+ "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
+ buffer_tostring(rc->warning->error_msg)
+ );
} else {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
-
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
+ rc->run_flags &= ~RRDCALC_FLAG_WARN_ERROR;
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
NETDATA_DOUBLE_FORMAT
- ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
- rc->calculation->parsed_as, rc->calculation->result,
- buffer_tostring(rc->calculation->error_msg), rc->source
- );
-
- rc->value = rc->calculation->result;
-
- if (rc->local) rc->local->last_updated = now;
- if (rc->family) rc->family->last_updated = now;
- if (rc->hostid) rc->hostid->last_updated = now;
- if (rc->hostname) rc->hostname->last_updated = now;
+ ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
+ rrdcalc_name(rc), rc->warning->result, buffer_tostring(rc->warning->error_msg), rrdcalc_source(rc)
+ );
+ warning_status = rrdcalc_value2status(rc->warning->result);
}
}
- }
-
- rrdhost_unlock(host);
-
- if (unlikely(runnable && !netdata_exit)) {
- rrdhost_rdlock(host);
- for (rc = host->alarms; rc; rc = rc->next) {
- if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
- continue;
+ // --------------------------------------------------------
+ // check the critical expression
- if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED) {
- continue;
- }
- RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED;
- RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED;
-
- // --------------------------------------------------------
- // check the warning expression
-
- if (likely(rc->warning)) {
- worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
-
- if (unlikely(!expression_evaluate(rc->warning))) {
- // calculation failed
- rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
-
- debug(D_HEALTH,
- "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
- buffer_tostring(rc->warning->error_msg)
- );
- } else {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
- NETDATA_DOUBLE_FORMAT
- ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
- rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source
- );
- warning_status = rrdcalc_value2status(rc->warning->result);
- }
- }
+ if (likely(rc->critical)) {
+ worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
- // --------------------------------------------------------
- // check the critical expression
-
- if (likely(rc->critical)) {
- worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
-
- if (unlikely(!expression_evaluate(rc->critical))) {
- // calculation failed
- rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
-
- debug(D_HEALTH,
- "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
- buffer_tostring(rc->critical->error_msg)
- );
- } else {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
- NETDATA_DOUBLE_FORMAT
- ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
- rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg),
- rc->source
- );
- critical_status = rrdcalc_value2status(rc->critical->result);
- }
- }
-
- // --------------------------------------------------------
- // decide the final alarm status
-
- RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED;
-
- switch (warning_status) {
- case RRDCALC_STATUS_CLEAR:
- status = RRDCALC_STATUS_CLEAR;
- break;
-
- case RRDCALC_STATUS_RAISED:
- status = RRDCALC_STATUS_WARNING;
- break;
+ if (unlikely(!expression_evaluate(rc->critical))) {
+ // calculation failed
+ rc->run_flags |= RRDCALC_FLAG_CRIT_ERROR;
- default:
- break;
+ debug(D_HEALTH,
+ "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
+ buffer_tostring(rc->critical->error_msg)
+ );
+ } else {
+ rc->run_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
+ NETDATA_DOUBLE_FORMAT
+ ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
+ rrdcalc_name(rc), rc->critical->result, buffer_tostring(rc->critical->error_msg),
+ rrdcalc_source(rc)
+ );
+ critical_status = rrdcalc_value2status(rc->critical->result);
}
+ }
- switch (critical_status) {
- case RRDCALC_STATUS_CLEAR:
- if (status == RRDCALC_STATUS_UNDEFINED)
- status = RRDCALC_STATUS_CLEAR;
- break;
+ // --------------------------------------------------------
+ // decide the final alarm status
- case RRDCALC_STATUS_RAISED:
- status = RRDCALC_STATUS_CRITICAL;
- break;
+ RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED;
- default:
- break;
- }
+ switch (warning_status) {
+ case RRDCALC_STATUS_CLEAR:
+ status = RRDCALC_STATUS_CLEAR;
+ break;
- // --------------------------------------------------------
- // check if the new status and the old differ
+ case RRDCALC_STATUS_RAISED:
+ status = RRDCALC_STATUS_WARNING;
+ break;
- if (status != rc->status) {
- worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
- int delay = 0;
-
- // apply trigger hysteresis
+ default:
+ break;
+ }
- if (now > rc->delay_up_to_timestamp) {
- rc->delay_up_current = rc->delay_up_duration;
- rc->delay_down_current = rc->delay_down_duration;
- rc->delay_last = 0;
- rc->delay_up_to_timestamp = 0;
- } else {
- rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
- if (rc->delay_up_current > rc->delay_max_duration)
- rc->delay_up_current = rc->delay_max_duration;
+ switch (critical_status) {
+ case RRDCALC_STATUS_CLEAR:
+ if (status == RRDCALC_STATUS_UNDEFINED)
+ status = RRDCALC_STATUS_CLEAR;
+ break;
- rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
- if (rc->delay_down_current > rc->delay_max_duration)
- rc->delay_down_current = rc->delay_max_duration;
- }
+ case RRDCALC_STATUS_RAISED:
+ status = RRDCALC_STATUS_CRITICAL;
+ break;
- if (status > rc->status)
- delay = rc->delay_up_current;
- else
- delay = rc->delay_down_current;
+ default:
+ break;
+ }
- // COMMENTED: because we do need to send raising alarms
- // if(now + delay < rc->delay_up_to_timestamp)
- // delay = (int)(rc->delay_up_to_timestamp - now);
+ // --------------------------------------------------------
+ // check if the new status and the old differ
- rc->delay_last = delay;
- rc->delay_up_to_timestamp = now + delay;
+ if (status != rc->status) {
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
+ int delay = 0;
+ // apply trigger hysteresis
- ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
- rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
- rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
- rc->delay_last,
- (
- ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
- ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
- )
- );
- health_alarm_log(host, ae);
+ if (now > rc->delay_up_to_timestamp) {
+ rc->delay_up_current = rc->delay_up_duration;
+ rc->delay_down_current = rc->delay_down_duration;
+ rc->delay_last = 0;
+ rc->delay_up_to_timestamp = 0;
+ } else {
+ rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
+ if (rc->delay_up_current > rc->delay_max_duration)
+ rc->delay_up_current = rc->delay_max_duration;
- rc->last_status_change = now;
- rc->old_status = rc->status;
- rc->status = status;
+ rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
+ if (rc->delay_down_current > rc->delay_max_duration)
+ rc->delay_down_current = rc->delay_max_duration;
}
- rc->last_updated = now;
- rc->next_update = now + rc->update_every;
-
- if (next_run > rc->next_update)
- next_run = rc->next_update;
+ if (status > rc->status)
+ delay = rc->delay_up_current;
+ else
+ delay = rc->delay_down_current;
+
+ // COMMENTED: because we do need to send raising alarms
+ // if(now + delay < rc->delay_up_to_timestamp)
+ // delay = (int)(rc->delay_up_to_timestamp - now);
+
+ rc->delay_last = delay;
+ rc->delay_up_to_timestamp = now + delay;
+
+ ALARM_ENTRY *ae = health_create_alarm_entry(
+ host,
+ rc->id,
+ rc->next_event_id++,
+ rc->config_hash_id,
+ now,
+ rc->name,
+ rc->rrdset->id,
+ rc->rrdset->context,
+ rc->rrdset->family,
+ rc->classification,
+ rc->component,
+ rc->type,
+ rc->exec,
+ rc->recipient,
+ now - rc->last_status_change,
+ rc->old_value,
+ rc->value,
+ rc->status,
+ status,
+ rc->source,
+ rc->units,
+ rc->info,
+ rc->delay_last,
+ (
+ ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
+ ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
+ (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
+ )
+ );
+
+ health_alarm_log_add_entry(host, ae);
+
+ log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status));
+
+ rc->last_status_change = now;
+ rc->old_status = rc->status;
+ rc->status = status;
}
- // process repeating alarms
- RRDCALC *rc;
- for(rc = host->alarms; rc ; rc = rc->next) {
- int repeat_every = 0;
- if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
- if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
- repeat_every = rc->warn_repeat_every;
- } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
- repeat_every = rc->crit_repeat_every;
- } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
- if(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE)) {
- if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
- repeat_every = 1;
- } else if (rc->old_status == RRDCALC_STATUS_WARNING) {
- repeat_every = 1;
- }
+ rc->last_updated = now;
+ rc->next_update = now + rc->update_every;
+
+ if (next_run > rc->next_update)
+ next_run = rc->next_update;
+ }
+ foreach_rrdcalc_in_rrdhost_done(rc);
+
+ // process repeating alarms
+ foreach_rrdcalc_in_rrdhost_read(host, rc) {
+ int repeat_every = 0;
+ if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
+ if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
+ rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE;
+ repeat_every = rc->warn_repeat_every;
+ } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
+ rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE;
+ repeat_every = rc->crit_repeat_every;
+ } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
+ if(!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE)) {
+ if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
+ repeat_every = 1;
+ } else if (rc->old_status == RRDCALC_STATUS_WARNING) {
+ repeat_every = 1;
}
}
- } else {
- continue;
}
+ } else {
+ continue;
+ }
- if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
- worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
- rc->last_repeat = now;
- if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
- ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
- rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
- rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
- rc->delay_last,
- (
- ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
- ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
- )
- );
- ae->last_repeat = rc->last_repeat;
- if (!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
- ae->flags |= HEALTH_ENTRY_RUN_ONCE;
- }
- rc->rrdcalc_flags |= RRDCALC_FLAG_RUN_ONCE;
- health_process_notifications(host, ae);
- debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
- health_alarm_wait_for_execution(ae);
- health_alarm_log_free_one_nochecks_nounlink(ae);
+ if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
+ rc->last_repeat = now;
+ if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
+
+ ALARM_ENTRY *ae = health_create_alarm_entry(
+ host,
+ rc->id,
+ rc->next_event_id++,
+ rc->config_hash_id,
+ now,
+ rc->name,
+ rc->rrdset->id,
+ rc->rrdset->context,
+ rc->rrdset->family,
+ rc->classification,
+ rc->component,
+ rc->type,
+ rc->exec,
+ rc->recipient,
+ now - rc->last_status_change,
+ rc->old_value,
+ rc->value,
+ rc->old_status,
+ rc->status,
+ rc->source,
+ rc->units,
+ rc->info,
+ rc->delay_last,
+ (
+ ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
+ ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
+ (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
+ )
+ );
+
+ ae->last_repeat = rc->last_repeat;
+ if (!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
+ ae->flags |= HEALTH_ENTRY_RUN_ONCE;
}
+ rc->run_flags |= RRDCALC_FLAG_RUN_ONCE;
+ health_process_notifications(host, ae);
+ debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
+ health_alarm_wait_for_execution(ae);
+ health_alarm_log_free_one_nochecks_nounlink(ae);
}
-
- rrdhost_unlock(host);
}
+ foreach_rrdcalc_in_rrdhost_done(rc);
+ }
- if (unlikely(netdata_exit))
- break;
+ if (unlikely(netdata_exit))
+ break;
- // execute notifications
- // and cleanup
- worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS);
- health_alarm_log_process(host);
+ // execute notifications
+ // and cleanup
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS);
+ health_alarm_log_process(host);
- if (unlikely(netdata_exit)) {
- // wait for all notifications to finish before allowing health to be cleaned up
- ALARM_ENTRY *ae;
- while (NULL != (ae = alarm_notifications_in_progress.head)) {
- health_alarm_wait_for_execution(ae);
- }
- break;
+ if (unlikely(netdata_exit)) {
+ // wait for all notifications to finish before allowing health to be cleaned up
+ ALARM_ENTRY *ae;
+ while (NULL != (ae = alarm_notifications_in_progress.head)) {
+ health_alarm_wait_for_execution(ae);
}
-
- } /* rrdhost_foreach */
+ break;
+ }
// wait for all notifications to finish before allowing health to be cleaned up
ALARM_ENTRY *ae;
@@ -1215,34 +1533,49 @@ void *health_main(void *ptr) {
}
#ifdef ENABLE_ACLK
- if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
- rrdhost_foreach_read(host) {
- if (unlikely(!host->health_enabled))
- continue;
- sql_queue_removed_alerts_to_aclk(host);
- }
- aclk_alert_reloaded = 0;
- marked_aclk_reload_loop = 0;
- }
+ if (netdata_cloud_setting && unlikely(host->aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
+ sql_queue_removed_alerts_to_aclk(host);
+ host->aclk_alert_reloaded = 0;
+ marked_aclk_reload_loop = 0;
+ }
#endif
- rrd_unlock();
-
if(unlikely(netdata_exit))
break;
- now = now_realtime_sec();
- if(now < next_run) {
- worker_is_idle();
- debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
- sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
- now = now_realtime_sec();
- }
- else
- debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
+ health_sleep(next_run, loop, host);
} // forever
netdata_thread_cleanup_pop(1);
return NULL;
}
+
+/*
+ * Add the "_is_ephemeral" and "_has_unstable_connection" labels to
+ * localhost's label dictionary.
+ *
+ * Each label value is "true" or "false", taken from the boolean options
+ * "is ephemeral" and "has unstable connection" of the health config section
+ * (CONFIG_BOOLEAN_NO is passed as the last argument of the lookup,
+ * presumably the default -- confirm against appconfig_get_boolean()).
+ */
+void health_add_host_labels(void) {
+    DICTIONARY *host_labels = localhost->rrdlabels;
+    const char *label_value;
+
+    if (appconfig_get_boolean(&netdata_config, CONFIG_SECTION_HEALTH, "is ephemeral", CONFIG_BOOLEAN_NO))
+        label_value = "true";
+    else
+        label_value = "false";
+    rrdlabels_add(host_labels, "_is_ephemeral", label_value, RRDLABEL_SRC_CONFIG);
+
+    if (appconfig_get_boolean(&netdata_config, CONFIG_SECTION_HEALTH, "has unstable connection", CONFIG_BOOLEAN_NO))
+        label_value = "true";
+    else
+        label_value = "false";
+    rrdlabels_add(host_labels, "_has_unstable_connection", label_value, RRDLABEL_SRC_CONFIG);
+}
+
+/*
+ * Start the health monitoring thread of a host, unless one has already been
+ * spawned for it (host->health_spawn is set).
+ *
+ * A struct health_state carrying the host pointer is allocated and handed to
+ * health_main() as the thread argument.
+ *
+ * On success host->health_spawn and host->aclk_alert_reloaded are set.
+ * On failure the error is logged and the state is released here, because no
+ * thread was created that could take ownership of it.
+ */
+void health_thread_spawn(RRDHOST * host) {
+    // nothing to do when this host already has its health thread
+    if(host->health_spawn)
+        return;
+
+    char tag[NETDATA_THREAD_TAG_MAX + 1];
+    snprintfz(tag, NETDATA_THREAD_TAG_MAX, "HEALTH[%s]", rrdhost_hostname(host));
+
+    struct health_state *health = callocz(1, sizeof(*health));
+    health->host = host;
+
+    if(netdata_thread_create(&host->health_thread, tag, NETDATA_THREAD_OPTION_JOINABLE, health_main, (void *) health)) {
+        log_health("[%s]: Failed to create new thread for client.", rrdhost_hostname(host));
+        error("HEALTH [%s]: Failed to create new thread for client.", rrdhost_hostname(host));
+
+        // health_main() never ran, so nothing else references the state:
+        // free it instead of leaking it on every failed attempt
+        freez(health);
+    }
+    else {
+        log_health("[%s]: Created new thread for client.", rrdhost_hostname(host));
+        host->health_spawn = 1;
+        host->aclk_alert_reloaded = 1;
+    }
+}