diff options
Diffstat (limited to 'health/health.c')
-rw-r--r-- | health/health.c | 1431 |
1 files changed, 882 insertions, 549 deletions
diff --git a/health/health.c b/health/health.c index 9eb36a9c6..3784e0f31 100644 --- a/health/health.c +++ b/health/health.c @@ -2,11 +2,166 @@ #include "health.h" +#define WORKER_HEALTH_JOB_RRD_LOCK 0 +#define WORKER_HEALTH_JOB_HOST_LOCK 1 +#define WORKER_HEALTH_JOB_DB_QUERY 2 +#define WORKER_HEALTH_JOB_CALC_EVAL 3 +#define WORKER_HEALTH_JOB_WARNING_EVAL 4 +#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5 +#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6 +#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7 +#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET 8 +#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM 9 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 10 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 10 +#endif + +static bool prepare_command(BUFFER *wb, + const char *exec, + const char *recipient, + const char *registry_hostname, + uint32_t unique_id, + uint32_t alarm_id, + uint32_t alarm_event_id, + uint32_t when, + const char *alert_name, + const char *alert_chart_name, + const char *alert_family, + const char *new_status, + const char *old_status, + NETDATA_DOUBLE new_value, + NETDATA_DOUBLE old_value, + const char *alert_source, + uint32_t duration, + uint32_t non_clear_duration, + const char *alert_units, + const char *alert_info, + const char *new_value_string, + const char *old_value_string, + const char *source, + const char *error_msg, + int n_warn, + int n_crit, + const char *warn_alarms, + const char *crit_alarms, + const char *classification, + const char *edit_command, + const char *machine_guid) +{ + char buf[8192]; + size_t n = 8192 - 1; + + buffer_strcat(wb, "exec"); + + if (!sanitize_command_argument_string(buf, exec, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, recipient, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, registry_hostname, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + buffer_sprintf(wb, " '%u'", unique_id); + + buffer_sprintf(wb, " '%u'", alarm_id); + + buffer_sprintf(wb, " '%u'", alarm_event_id); + + buffer_sprintf(wb, " '%u'", when); + + if (!sanitize_command_argument_string(buf, alert_name, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, alert_chart_name, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, alert_family, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, new_status, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, old_status, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", new_value); + + buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", old_value); + + if (!sanitize_command_argument_string(buf, alert_source, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + buffer_sprintf(wb, " '%u'", duration); + + buffer_sprintf(wb, " '%u'", non_clear_duration); + + if (!sanitize_command_argument_string(buf, alert_units, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, alert_info, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, new_value_string, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, old_value_string, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, source, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, error_msg, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + buffer_sprintf(wb, " '%d'", n_warn); + + buffer_sprintf(wb, " '%d'", n_crit); + + if (!sanitize_command_argument_string(buf, warn_alarms, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, crit_alarms, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, classification, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, edit_command, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, machine_guid, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + return true; +} + unsigned int default_health_enabled = 1; char *silencers_filename; // the queue of executed alarm notifications that haven't been waited for yet -static struct { +static __thread struct { ALARM_ENTRY *head; // oldest ALARM_ENTRY *tail; // latest } alarm_notifications_in_progress = {NULL, NULL}; @@ -146,77 +301,51 @@ void health_init(void) { * @param host the structure of the host that the function will reload the configuration. */ static void health_reload_host(RRDHOST *host) { - if(unlikely(!host->health_enabled)) + if(unlikely(!host->health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) return; + log_health("[%s]: Reloading health.", rrdhost_hostname(host)); + char *user_path = health_user_config_dir(); char *stock_path = health_stock_config_dir(); // free all running alarms - rrdhost_wrlock(host); - - while(host->templates) - rrdcalctemplate_unlink_and_free(host, host->templates); - - RRDCALCTEMPLATE *rt,*next; - for(rt = host->alarms_template_with_foreach; rt ; rt = next) { - next = rt->next; - rrdcalctemplate_free(rt); - } - host->alarms_template_with_foreach = NULL; - - while(host->alarms) - rrdcalc_unlink_and_free(host, host->alarms); - - RRDCALC *rc,*nc; - for(rc = host->alarms_with_foreach; rc ; rc = nc) { - nc = rc->next; - rrdcalc_free(rc); - } - host->alarms_with_foreach = NULL; - - rrdhost_unlock(host); + rrdcalc_delete_all(host); + rrdcalctemplate_delete_all(host); // invalidate all previous entries in the alarm log + netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); ALARM_ENTRY *t; for(t = host->health_log.alarms ; t ; t = t->next) { if(t->new_status != RRDCALC_STATUS_REMOVED) t->flags |= HEALTH_ENTRY_FLAG_UPDATED; } + netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); - rrdhost_rdlock(host); // reset all thresholds to all charts RRDSET *st; rrdset_foreach_read(st, host) { st->green = NAN; st->red = NAN; } - rrdhost_unlock(host); + rrdset_foreach_done(st); // load the new alarms - rrdhost_wrlock(host); health_readdir(host, user_path, stock_path, NULL); //Discard alarms with labels that do not apply to host - rrdcalc_labels_unlink_alarm_from_host(host); + rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host); // link the loaded alarms to their charts - RRDDIM *rd; rrdset_foreach_write(st, host) { if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED)) continue; - rrdsetcalc_link_matching(st); - rrdcalctemplate_link_matching(st); - //This loop must be the last, because ` rrdcalctemplate_link_matching` will create alarms related to it. - rrdset_rdlock(st); - rrddim_foreach_read(rd, st) { - rrdcalc_link_to_rrddim(rd, st, host); - } - rrdset_unlock(st); + rrdcalc_link_matching_alerts_to_rrdset(st); + rrdcalctemplate_link_matching_templates_to_rrdset(st); } - - rrdhost_unlock(host); + rrdset_foreach_done(st); + host->aclk_alert_reloaded = 1; } /** @@ -234,11 +363,6 @@ void health_reload(void) { health_reload_host(host); rrd_unlock(); -#ifdef ENABLE_ACLK - if (netdata_cloud_setting) { - aclk_alert_reloaded = 1; - } -#endif } // ---------------------------------------------------------------------------- @@ -250,7 +374,6 @@ static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) { return RRDCALC_STATUS_CLEAR; } -#define ALARM_EXEC_COMMAND_LENGTH 8192 #define ACTIVE_ALARMS_LIST_EXAMINE 500 #define ACTIVE_ALARMS_LIST 15 @@ -266,13 +389,14 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) { // do not send notifications for internal statuses - debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); goto done; } if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) { // do not send notifications for disabled statuses - debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); // mark it as run, so that we will send the same alarm if it happens again goto done; } @@ -292,7 +416,9 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { // we have executed this alarm notification in the past if(t && t->new_status == ae->new_status) { // don't send the notification for the same status again - debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name + debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae) + , rrdcalc_status2string(ae->new_status)); + log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae) , rrdcalc_status2string(ae->new_status)); goto done; } @@ -303,7 +429,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) { if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) { debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s" - , ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + , ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); goto done; } } @@ -312,14 +438,14 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { // Check if alarm notifications are silenced if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) { - info("Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); goto done; } - static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1]; + log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); - const char *exec = (ae->exec) ? ae->exec : host->health_default_exec; - const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient; + const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health_default_exec); + const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health_default_recipient); int n_warn=0, n_crit=0; RRDCALC *rc; @@ -330,13 +456,16 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); - for(rc = host->alarms; rc && (n_warn + n_crit) < ACTIVE_ALARMS_LIST_EXAMINE ; rc = rc->next) { + foreach_rrdcalc_in_rrdhost_read(host, rc) { if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) continue; + if(unlikely((n_warn + n_crit) >= ACTIVE_ALARMS_LIST_EXAMINE)) + break; + if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) { if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) { - active_alerts[n_warn+n_crit].name = rc->name; + active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc); active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change; active_alerts[n_warn+n_crit].status = rc->status; n_warn++; @@ -344,7 +473,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { expr = rc->warning; } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) { if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) { - active_alerts[n_warn+n_crit].name = rc->name; + active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc); active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change; active_alerts[n_warn+n_crit].status = rc->status; n_crit++; @@ -355,6 +484,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { expr = rc->warning; } } + foreach_rrdcalc_in_rrdhost_done(rc); if (n_warn+n_crit>1) qsort (active_alerts, n_warn+n_crit, sizeof(active_alerts_t), compare_active_alerts); @@ -379,51 +509,55 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { } } - char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN"); - - snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" NETDATA_DOUBLE_FORMAT_ZERO - "' '" NETDATA_DOUBLE_FORMAT_ZERO - "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'", - exec, - recipient, - host->registry_hostname, - ae->unique_id, - ae->alarm_id, - ae->alarm_event_id, - (unsigned long)ae->when, - ae->name, - ae->chart?ae->chart:"NOCHART", - ae->family?ae->family:"NOFAMILY", - rrdcalc_status2string(ae->new_status), - rrdcalc_status2string(ae->old_status), - ae->new_value, - ae->old_value, - ae->source?ae->source:"UNKNOWN", - (uint32_t)ae->duration, - (uint32_t)ae->non_clear_duration, - ae->units?ae->units:"", - ae->info?ae->info:"", - ae->new_value_string, - ae->old_value_string, - (expr && expr->source)?expr->source:"NOSOURCE", - (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG", - n_warn, - n_crit, - buffer_tostring(warn_alarms), - buffer_tostring(crit_alarms), - ae->classification?ae->classification:"Unknown", - edit_command, - host != localhost ? host->machine_guid:"" - ); - - ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN; - ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */ - - debug(D_HEALTH, "executing command '%s'", command_to_run); - ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS; - ae->exec_spawn_serial = spawn_enq_cmd(command_to_run); - enqueue_alarm_notify_in_progress(ae); + char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN"); + + BUFFER *wb = buffer_create(8192); + bool ok = prepare_command(wb, + exec, + recipient, + rrdhost_registry_hostname(host), + ae->unique_id, + ae->alarm_id, + ae->alarm_event_id, + (unsigned long)ae->when, + ae_name(ae), + ae->chart?ae_chart_name(ae):"NOCHART", + ae->family?ae_family(ae):"NOFAMILY", + rrdcalc_status2string(ae->new_status), + rrdcalc_status2string(ae->old_status), + ae->new_value, + ae->old_value, + ae->source?ae_source(ae):"UNKNOWN", + (uint32_t)ae->duration, + (uint32_t)ae->non_clear_duration, + ae_units(ae), + ae_info(ae), + ae_new_value_string(ae), + ae_old_value_string(ae), + (expr && expr->source)?expr->source:"NOSOURCE", + (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG", + n_warn, + n_crit, + buffer_tostring(warn_alarms), + buffer_tostring(crit_alarms), + ae->classification?ae_classification(ae):"Unknown", + edit_command, + host != localhost ? host->machine_guid:""); + + const char *command_to_run = buffer_tostring(wb); + if (ok) { + ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN; + ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */ + + debug(D_HEALTH, "executing command '%s'", command_to_run); + ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS; + ae->exec_spawn_serial = spawn_enq_cmd(command_to_run); + enqueue_alarm_notify_in_progress(ae); + } else { + error("Failed to format command arguments"); + } + buffer_free(wb); freez(edit_command); buffer_free(warn_alarms); buffer_free(crit_alarms); @@ -450,7 +584,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) { static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) { debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s", - ae->chart?ae->chart:"NOCHART", ae->name, + ae->chart?ae_chart_name(ae):"NOCHART", ae_name(ae), ae->new_value, rrdcalc_status2string(ae->old_status), rrdcalc_status2string(ae->new_status) @@ -467,7 +601,7 @@ static inline void health_alarm_log_process(RRDHOST *host) { ALARM_ENTRY *ae; for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) { - if(likely(!alarm_entry_isrepeating(host, ae))) { + if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) { if(unlikely( !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) && !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED) @@ -481,13 +615,13 @@ static inline void health_alarm_log_process(RRDHOST *host) { } } + netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); + // remember this for the next iteration host->health_last_processed_id = first_waiting; bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max; - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); - if (!cleanup_excess_log_entries) return; @@ -508,7 +642,7 @@ static inline void health_alarm_log_process(RRDHOST *host) { ALARM_ENTRY *t = ae->next; - if(likely(!alarm_entry_isrepeating(host, ae))) { + if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) { health_alarm_wait_for_execution(ae); health_alarm_log_free_one_nochecks_nounlink(ae); host->health_log.count--; @@ -522,7 +656,7 @@ static inline void health_alarm_log_process(RRDHOST *host) { static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) { if(unlikely(!rc->rrdset)) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } @@ -533,40 +667,38 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) *next_run = rc->next_update; } - debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now)); + debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now)); return 0; } if(unlikely(!rc->update_every)) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } int update_every = rc->rrdset->update_every; - rrdset_rdlock(rc->rrdset); - time_t first = rrdset_first_entry_t_nolock(rc->rrdset); - time_t last = rrdset_last_entry_t_nolock(rc->rrdset); - rrdset_unlock(rc->rrdset); + time_t first = rrdset_first_entry_t(rc->rrdset); + time_t last = rrdset_last_entry_t(rc->rrdset); if(unlikely(now + update_every < first /* || now - update_every > last */)) { debug(D_HEALTH , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)." - , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first + , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) now, (unsigned long) first , (unsigned long) last); return 0; } @@ -577,7 +709,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) if(needed + update_every < first || needed - update_every > last) { debug(D_HEALTH , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)." - , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first + , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) needed, (unsigned long) first , (unsigned long) last); return 0; } @@ -587,7 +719,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) } static inline int check_if_resumed_from_suspension(void) { - static usec_t last_realtime = 0, last_monotonic = 0; + static __thread usec_t last_realtime = 0, last_monotonic = 0; usec_t realtime = now_realtime_usec(), monotonic = now_monotonic_usec(); int ret = 0; @@ -603,41 +735,142 @@ static inline int check_if_resumed_from_suspension(void) { return ret; } -static void health_main_cleanup(void *ptr) { +static void health_thread_cleanup(void *ptr) { worker_unregister(); - struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; - static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; + struct health_state *h = ptr; + h->host->health_spawn = 0; + + netdata_thread_cancel(netdata_thread_self()); + log_health("[%s]: Health thread ended.", rrdhost_hostname(h->host)); + debug(D_HEALTH, "HEALTH %s: Health thread ended.", rrdhost_hostname(h->host)); +} + +static void initialize_health(RRDHOST *host, int is_localhost) { + if(!host->health_enabled || rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) return; + rrdhost_flag_set(host, RRDHOST_FLAG_INITIALIZED_HEALTH); + + log_health("[%s]: Initializing health.", rrdhost_hostname(host)); + + host->health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never"); + host->health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never"); + + host->health_log.next_log_id = 1; + host->health_log.next_alarm_id = 1; + host->health_log.max = 1000; + host->health_log.next_log_id = (uint32_t)now_realtime_sec(); + host->health_log.next_alarm_id = 0; + + long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max); + if(n < 10) { + error("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max); + config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max); + } + else + host->health_log.max = (unsigned int)n; + + netdata_rwlock_init(&host->health_log.alarm_log_rwlock); + + char filename[FILENAME_MAX + 1]; + + if(!is_localhost) { + int r = mkdir(host->varlib_dir, 0775); + if (r != 0 && errno != EEXIST) + error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), host->varlib_dir); + } + + { + snprintfz(filename, FILENAME_MAX, "%s/health", host->varlib_dir); + int r = mkdir(filename, 0775); + if(r != 0 && errno != EEXIST) + error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), filename); + } + snprintfz(filename, FILENAME_MAX, "%s/health/health-log.db", host->varlib_dir); + host->health_log_filename = strdupz(filename); + + snprintfz(filename, FILENAME_MAX, "%s/alarm-notify.sh", netdata_configured_primary_plugins_dir); + host->health_default_exec = string_strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename)); + host->health_default_recipient = string_strdupz("root"); + + if (!file_is_migrated(host->health_log_filename)) { + int rc = sql_create_health_log_table(host); + if (unlikely(rc)) { + log_health("[%s]: Failed to create health log table in the database", rrdhost_hostname(host)); + health_alarm_log_load(host); + health_alarm_log_open(host); + } + else { + health_alarm_log_load(host); + add_migrated_file(host->health_log_filename, 0); + } + } else { + // TODO: This needs to go to the metadata thread + // Health should wait before accessing the table (needs to be created by the metadata thread) + sql_create_health_log_table(host); + sql_health_alarm_log_load(host); + } + + // ------------------------------------------------------------------------ + // load health configuration + + health_readdir(host, health_user_config_dir(), health_stock_config_dir(), NULL); - info("cleaning up..."); + // link the loaded alarms to their charts + RRDSET *st; + rrdset_foreach_write(st, host) { + if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED)) + continue; + + rrdcalc_link_matching_alerts_to_rrdset(st); + rrdcalctemplate_link_matching_templates_to_rrdset(st); + } + rrdset_foreach_done(st); - static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; + //Discard alarms with labels that do not apply to host + rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host); + + health_silencers_init(); +} + +static void health_sleep(time_t next_run, unsigned int loop __maybe_unused, RRDHOST *host) { + time_t now = now_realtime_sec(); + if(now < next_run) { + worker_is_idle(); + debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now)); + while (now < next_run && host->health_enabled && !netdata_exit) { + sleep_usec(USEC_PER_SEC); + now = now_realtime_sec(); + } + } + else { + debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop); + } } -static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { +static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *silencers) { SILENCER *s; debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s", - rc->name, (rc->rrdset)?rc->rrdset->context:"", rc->chart, host, (rc->rrdset)?rc->rrdset->family:""); + rrdcalc_name(rc), (rc->rrdset)?rrdset_context(rc->rrdset):"", rrdcalc_chart_name(rc), host, (rc->rrdset)?rrdset_family(rc->rrdset):""); for (s = silencers->silencers; s!=NULL; s=s->next){ if ( - (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern,rc->name))) && - (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern,rc->rrdset->context))) && + (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern, rrdcalc_name(rc)))) && + (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern, rrdset_context(rc->rrdset)))) && (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) && - (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern,rc->chart))) && - (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern,rc->rrdset->family))) + (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern, rrdcalc_chart_name(rc)))) && + (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern, rrdset_family(rc->rrdset)))) ) { debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families); if (unlikely(silencers->stype == STYPE_NONE)) { - debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name); + debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc)); } else { debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s" , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced" - , rc->name - , (rc->rrdset)?rc->rrdset->context:"" - , rc->chart + , rrdcalc_name(rc) + , (rc->rrdset)?rrdset_context(rc->rrdset):"" + , rrdcalc_chart_name(rc) , host - , (rc->rrdset)?rc->rrdset->family:"" + , (rc->rrdset)?rrdset_family(rc->rrdset):"" ); } return silencers->stype; @@ -657,66 +890,86 @@ static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers * @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise */ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { - uint32_t rrdcalc_flags_old = rc->rrdcalc_flags; + uint32_t rrdcalc_flags_old = rc->run_flags; // Clear the flags - rc->rrdcalc_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED); + rc->run_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED); if (unlikely(silencers->all_alarms)) { - if (silencers->stype == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED; - else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED; + if (silencers->stype == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED; + else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED; } else { - SILENCE_TYPE st = check_silenced(rc, host->hostname, silencers); - if (st == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED; - else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED; + SILENCE_TYPE st = check_silenced(rc, rrdhost_hostname(host), silencers); + if (st == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED; + else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED; } - if (rrdcalc_flags_old != rc->rrdcalc_flags) { + if (rrdcalc_flags_old != rc->run_flags) { info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s", - host->hostname, - rc->name, + rrdhost_hostname(host), + rrdcalc_name(rc), (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false", - (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false", + (rc->run_flags & RRDCALC_FLAG_DISABLED)?"true":"false", (rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false", - (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false" + (rc->run_flags & RRDCALC_FLAG_SILENCED)?"true":"false" ); } - if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED) + if (rc->run_flags & RRDCALC_FLAG_DISABLED) return 1; else return 0; } -// Create alarms for dimensions that have been added to charts -// since the previous iteration. -static void init_pending_foreach_alarms(RRDHOST *host) { +static void health_execute_delayed_initializations(RRDHOST *host) { RRDSET *st; - RRDDIM *rd; - if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS)) - return; + if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION)) return; + rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION); - rrdhost_wrlock(host); + rrdset_foreach_reentrant(st, host) { + if(!rrdset_flag_check(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION)) continue; + rrdset_flag_clear(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION); - rrdset_foreach_write(st, host) { - if (!rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS)) - continue; + worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET); + + if(!st->rrdfamily) + st->rrdfamily = rrdfamily_add_and_acquire(host, rrdset_family(st)); + + if(!st->rrdvars) + st->rrdvars = rrdvariables_create(); + + rrddimvar_index_init(st); - rrdset_rdlock(st); + rrdsetvar_add_and_leave_released(st, "last_collected_t", RRDVAR_TYPE_TIME_T, &st->last_collected_time.tv_sec, RRDVAR_FLAG_NONE); + rrdsetvar_add_and_leave_released(st, "green", RRDVAR_TYPE_CALCULATED, &st->green, RRDVAR_FLAG_NONE); + rrdsetvar_add_and_leave_released(st, "red", RRDVAR_TYPE_CALCULATED, &st->red, RRDVAR_FLAG_NONE); + rrdsetvar_add_and_leave_released(st, "update_every", RRDVAR_TYPE_INT, &st->update_every, RRDVAR_FLAG_NONE); + rrdcalc_link_matching_alerts_to_rrdset(st); + rrdcalctemplate_link_matching_templates_to_rrdset(st); + + RRDDIM *rd; rrddim_foreach_read(rd, st) { - if (!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM)) - continue; + if(!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION)) continue; + rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION); - rrdcalc_link_to_rrddim(rd, st, host); + worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM); - rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM); - } + rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_CALCULATED, NULL, NULL, &rd->last_stored_value, RRDVAR_FLAG_NONE); + rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_COLLECTED, NULL, "_raw", &rd->last_collected_value, RRDVAR_FLAG_NONE); + rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_TIME_T, NULL, "_last_collected_t", &rd->last_collected_time.tv_sec, RRDVAR_FLAG_NONE); - rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS); - rrdset_unlock(st); - } + RRDCALCTEMPLATE *rt; + foreach_rrdcalctemplate_read(host, rt) { + if(!rt->foreach_dimension_pattern) + continue; - rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS); - rrdhost_unlock(host); + if(rrdcalctemplate_check_rrdset_conditions(rt, st, host)) + rrdcalctemplate_check_rrddim_conditions_and_link(rt, st, rd, host); + } + foreach_rrdcalctemplate_done(rt); + } + rrddim_foreach_done(rd); + } + rrdset_foreach_done(st); } /** @@ -729,19 +982,6 @@ static void init_pending_foreach_alarms(RRDHOST *host) { * @return It always returns NULL */ -#define WORKER_HEALTH_JOB_RRD_LOCK 0 -#define WORKER_HEALTH_JOB_HOST_LOCK 1 -#define WORKER_HEALTH_JOB_DB_QUERY 2 -#define WORKER_HEALTH_JOB_CALC_EVAL 3 -#define WORKER_HEALTH_JOB_WARNING_EVAL 4 -#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5 -#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6 -#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7 - -#if WORKER_UTILIZATION_MAX_JOB_TYPES < 8 -#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8 -#endif - void *health_main(void *ptr) { worker_register("HEALTH"); worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock"); @@ -752,8 +992,14 @@ void *health_main(void *ptr) { worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval"); worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry"); worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process"); + worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET, "rrdset init"); + worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM, "rrddim init"); - netdata_thread_cleanup_push(health_main_cleanup, ptr); + struct health_state *h = ptr; + netdata_thread_cleanup_push(health_thread_cleanup, ptr); + + RRDHOST *host = h->host; + initialize_health(host, host == localhost); int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10); if(min_run_every < 1) min_run_every = 1; @@ -763,16 +1009,21 @@ void *health_main(void *ptr) { time_t now = now_realtime_sec(); time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60); - rrdcalc_labels_unlink(); + bool health_running_logged = false; + + rrdhost_rdlock(host); //CHECK + rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host); + rrdhost_unlock(host); unsigned int loop = 0; #ifdef ENABLE_ACLK unsigned int marked_aclk_reload_loop = 0; #endif - while(!netdata_exit) { + while(!netdata_exit && host->health_enabled) { loop++; debug(D_HEALTH, "Health monitoring iteration no %u started", loop); + now = now_realtime_sec(); int runnable = 0, apply_hibernation_delay = 0; time_t next_run = now + min_run_every; RRDCALC *rc; @@ -780,433 +1031,500 @@ void *health_main(void *ptr) { if (unlikely(check_if_resumed_from_suspension())) { apply_hibernation_delay = 1; - info( - "Postponing alarm checks for %"PRId64" seconds, " - "because it seems that the system was just resumed from suspension.", - (int64_t)hibernation_delay); + log_health( + "[%s]: Postponing alarm checks for %"PRId64" seconds, " + "because it seems that the system was just resumed from suspension.", + rrdhost_hostname(host), + (int64_t)hibernation_delay); } if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) { - static int logged=0; + static __thread int logged=0; if (!logged) { - info("Skipping health checks, because all alarms are disabled via a %s command.", - HEALTH_CMDAPI_CMD_DISABLEALL); + log_health("[%s]: Skipping health checks, because all alarms are disabled via a %s command.", + rrdhost_hostname(host), + HEALTH_CMDAPI_CMD_DISABLEALL); logged = 1; } } #ifdef ENABLE_ACLK - if (aclk_alert_reloaded && !marked_aclk_reload_loop) + if (host->aclk_alert_reloaded && !marked_aclk_reload_loop) marked_aclk_reload_loop = loop; #endif - worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK); - rrd_rdlock(); + if (unlikely(apply_hibernation_delay)) { + log_health( + "[%s]: Postponing health checks for %"PRId64" seconds.", + rrdhost_hostname(host), + (int64_t)hibernation_delay); - RRDHOST *host; - rrdhost_foreach_read(host) { - if (unlikely(!host->health_enabled)) + host->health_delay_up_to = now + hibernation_delay; + next_run = now + hibernation_delay; + health_sleep(next_run, loop, host); + } + + if (unlikely(host->health_delay_up_to)) { + if (unlikely(now < host->health_delay_up_to)) { + next_run = host->health_delay_up_to; + health_sleep(next_run, loop, host); continue; + } - if (unlikely(apply_hibernation_delay)) { - info( - "Postponing health checks for %"PRId64" seconds, on host '%s'.", - (int64_t)hibernation_delay, - host->hostname); + log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host)); + host->health_delay_up_to = 0; + } - host->health_delay_up_to = now + hibernation_delay; + // wait until cleanup of obsolete charts on children is complete + if (host != localhost) { + if (unlikely(host->trigger_chart_obsoletion_check == 1)) { + log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host)); + health_sleep(next_run, loop, host); + continue; } + } - if (unlikely(host->health_delay_up_to)) { - if (unlikely(now < host->health_delay_up_to)) - continue; + if (!health_running_logged) { + log_health("[%s]: Health is running.", rrdhost_hostname(host)); + health_running_logged = true; + } - info("Resuming health checks on host '%s'.", host->hostname); - host->health_delay_up_to = 0; - } + if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0)) + sql_health_alarm_log_cleanup(host); - // wait until cleanup of obsolete charts on children is complete - if (host != localhost) - if (unlikely(host->trigger_chart_obsoletion_check == 1)) - continue; + health_execute_delayed_initializations(host); - if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0)) - sql_health_alarm_log_cleanup(host); + worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK); - init_pending_foreach_alarms(host); + // the first loop is to lookup values from the db + foreach_rrdcalc_in_rrdhost_read(host, rc) { - worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK); - rrdhost_rdlock(host); + rrdcalc_update_info_using_rrdset_labels(rc); - // the first loop is to lookup values from the db - for (rc = host->alarms; rc; rc = rc->next) { + if (update_disabled_silenced(host, rc)) + continue; - if (update_disabled_silenced(host, rc)) - continue; + // create an alert removed event if the chart is obsolete and + // has stopped being collected for 60 seconds + if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED && + rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) && + now > (rc->rrdset->last_collected_time.tv_sec + 60))) { + if (!rrdcalc_isrepeating(rc)) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); + time_t now = now_realtime_sec(); + + ALARM_ENTRY *ae = health_create_alarm_entry( + host, + rc->id, + rc->next_event_id++, + rc->config_hash_id, + now, + rc->name, + rc->rrdset->id, + rc->rrdset->context, + rc->rrdset->family, + rc->classification, + rc->component, + rc->type, + rc->exec, + rc->recipient, + now - rc->last_status_change, + rc->value, + NAN, + rc->status, + RRDCALC_STATUS_REMOVED, + rc->source, + rc->units, + rc->info, + 0, + rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0); + + if (ae) { + health_alarm_log_add_entry(host, ae); + rc->old_status = rc->status; + rc->status = RRDCALC_STATUS_REMOVED; + rc->last_status_change = now; + rc->last_updated = now; + rc->value = NAN; - // create an alert removed event if the chart is obsolete and - // has stopped being collected for 60 seconds - if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED && - rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) && - now > (rc->rrdset->last_collected_time.tv_sec + 60))) { - if (!rrdcalc_isrepeating(rc)) { - worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); - time_t now = now_realtime_sec(); - ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context, - rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, - rc->value, NAN, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0, 0); - if (ae) { - health_alarm_log(host, ae); - rc->old_status = rc->status; - rc->status = RRDCALC_STATUS_REMOVED; - rc->last_status_change = now; - rc->last_updated = now; - rc->value = NAN; #ifdef ENABLE_ACLK - if (netdata_cloud_setting && likely(!aclk_alert_reloaded)) - sql_queue_alarm_to_aclk(host, ae, 1); + if (netdata_cloud_setting && likely(!host->aclk_alert_reloaded)) + sql_queue_alarm_to_aclk(host, ae, 1); #endif - } } } + } - if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) { - if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE; - continue; - } - - runnable++; - rc->old_value = rc->value; - rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE; - - // ------------------------------------------------------------ - // if there is database lookup, do it - - if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) { - worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY); - - /* time_t old_db_timestamp = rc->db_before; */ - int value_is_null = 0; + if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) { + if (unlikely(rc->run_flags & RRDCALC_FLAG_RUNNABLE)) + rc->run_flags &= ~RRDCALC_FLAG_RUNNABLE; + continue; + } - int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1, - rc->after, rc->before, rc->group, NULL, - 0, rc->options, - &rc->db_after,&rc->db_before, - NULL, NULL, NULL, - &value_is_null, NULL, 0, 0); + runnable++; + rc->old_value = rc->value; + rc->run_flags |= RRDCALC_FLAG_RUNNABLE; + + // ------------------------------------------------------------ + // if there is database lookup, do it + + if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) { + worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY); + + /* time_t old_db_timestamp = rc->db_before; */ + int value_is_null = 0; + + int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rrdcalc_dimensions(rc), 1, + rc->after, rc->before, rc->group, NULL, + 0, rc->options, + &rc->db_after,&rc->db_before, + NULL, NULL, NULL, + &value_is_null, NULL, 0, 0, + QUERY_SOURCE_HEALTH); + + if (unlikely(ret != 200)) { + // database lookup failed + rc->value = NAN; + rc->run_flags |= RRDCALC_FLAG_DB_ERROR; + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), ret + ); + } else + rc->run_flags &= ~RRDCALC_FLAG_DB_ERROR; + + /* - RRDCALC_FLAG_DB_STALE not currently used + if (unlikely(old_db_timestamp == rc->db_before)) { + // database is stale + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); + + if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) { + rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE; + error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); + } + } + else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE)) + rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE; + */ + + if (unlikely(value_is_null)) { + // collected value is null + rc->value = NAN; + rc->run_flags |= RRDCALC_FLAG_DB_NAN; + + debug(D_HEALTH, + "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc) + ); + } else + rc->run_flags &= ~RRDCALC_FLAG_DB_NAN; + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->value + ); + } - if (unlikely(ret != 200)) { - // database lookup failed - rc->value = NAN; - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR; + // ------------------------------------------------------------ + // if there is calculation expression, run it - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", - host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret - ); - } else - rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR; + if (unlikely(rc->calculation)) { + worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL); - /* - RRDCALC_FLAG_DB_STALE not currently used - if (unlikely(old_db_timestamp == rc->db_before)) { - // database is stale + if (unlikely(!expression_evaluate(rc->calculation))) { + // calculation failed + rc->value = NAN; + rc->run_flags |= RRDCALC_FLAG_CALC_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), + rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg) + ); + } else { + rc->run_flags &= ~RRDCALC_FLAG_CALC_ERROR; - if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE; - error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); - } - } - else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE; - */ + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " + NETDATA_DOUBLE_FORMAT + ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), + rc->calculation->parsed_as, rc->calculation->result, + buffer_tostring(rc->calculation->error_msg), rrdcalc_source(rc) + ); - if (unlikely(value_is_null)) { - // collected value is null - rc->value = NAN; - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN; + rc->value = rc->calculation->result; + } + } + } + foreach_rrdcalc_in_rrdhost_done(rc); - debug(D_HEALTH, - "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", - host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name - ); - } else - rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN; + if (unlikely(runnable && !netdata_exit)) { + foreach_rrdcalc_in_rrdhost_read(host, rc) { + if (unlikely(!(rc->run_flags & RRDCALC_FLAG_RUNNABLE))) + continue; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, - rc->value - ); + if (rc->run_flags & RRDCALC_FLAG_DISABLED) { + continue; } + RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED; + RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED; - // ------------------------------------------------------------ - // if there is calculation expression, run it + // -------------------------------------------------------- + // check the warning expression - if (unlikely(rc->calculation)) { - worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL); + if (likely(rc->warning)) { + worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL); - if (unlikely(!expression_evaluate(rc->calculation))) { + if (unlikely(!expression_evaluate(rc->warning))) { // calculation failed - rc->value = NAN; - rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR; + rc->run_flags |= RRDCALC_FLAG_WARN_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", - host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, - rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg) - ); + debug(D_HEALTH, + "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), + buffer_tostring(rc->warning->error_msg) + ); } else { - rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR; - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " + rc->run_flags &= ~RRDCALC_FLAG_WARN_ERROR; + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " NETDATA_DOUBLE_FORMAT - ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, - rc->calculation->parsed_as, rc->calculation->result, - buffer_tostring(rc->calculation->error_msg), rc->source - ); - - rc->value = rc->calculation->result; - - if (rc->local) rc->local->last_updated = now; - if (rc->family) rc->family->last_updated = now; - if (rc->hostid) rc->hostid->last_updated = now; - if (rc->hostname) rc->hostname->last_updated = now; + ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), + rrdcalc_name(rc), rc->warning->result, buffer_tostring(rc->warning->error_msg), rrdcalc_source(rc) + ); + warning_status = rrdcalc_value2status(rc->warning->result); } } - } - - rrdhost_unlock(host); - - if (unlikely(runnable && !netdata_exit)) { - rrdhost_rdlock(host); - for (rc = host->alarms; rc; rc = rc->next) { - if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))) - continue; + // -------------------------------------------------------- + // check the critical expression - if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED) { - continue; - } - RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED; - RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED; - - // -------------------------------------------------------- - // check the warning expression - - if (likely(rc->warning)) { - worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL); - - if (unlikely(!expression_evaluate(rc->warning))) { - // calculation failed - rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR; - - debug(D_HEALTH, - "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", - host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, - buffer_tostring(rc->warning->error_msg) - ); - } else { - rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " - NETDATA_DOUBLE_FORMAT - ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", - rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source - ); - warning_status = rrdcalc_value2status(rc->warning->result); - } - } + if (likely(rc->critical)) { + worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL); - // -------------------------------------------------------- - // check the critical expression - - if (likely(rc->critical)) { - worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL); - - if (unlikely(!expression_evaluate(rc->critical))) { - // calculation failed - rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR; - - debug(D_HEALTH, - "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", - host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, - buffer_tostring(rc->critical->error_msg) - ); - } else { - rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " - NETDATA_DOUBLE_FORMAT - ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", - rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg), - rc->source - ); - critical_status = rrdcalc_value2status(rc->critical->result); - } - } - - // -------------------------------------------------------- - // decide the final alarm status - - RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED; - - switch (warning_status) { - case RRDCALC_STATUS_CLEAR: - status = RRDCALC_STATUS_CLEAR; - break; - - case RRDCALC_STATUS_RAISED: - status = RRDCALC_STATUS_WARNING; - break; + if (unlikely(!expression_evaluate(rc->critical))) { + // calculation failed + rc->run_flags |= RRDCALC_FLAG_CRIT_ERROR; - default: - break; + debug(D_HEALTH, + "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), + buffer_tostring(rc->critical->error_msg) + ); + } else { + rc->run_flags &= ~RRDCALC_FLAG_CRIT_ERROR; + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " + NETDATA_DOUBLE_FORMAT + ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), + rrdcalc_name(rc), rc->critical->result, buffer_tostring(rc->critical->error_msg), + rrdcalc_source(rc) + ); + critical_status = rrdcalc_value2status(rc->critical->result); } + } - switch (critical_status) { - case RRDCALC_STATUS_CLEAR: - if (status == RRDCALC_STATUS_UNDEFINED) - status = RRDCALC_STATUS_CLEAR; - break; + // -------------------------------------------------------- + // decide the final alarm status - case RRDCALC_STATUS_RAISED: - status = RRDCALC_STATUS_CRITICAL; - break; + RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED; - default: - break; - } + switch (warning_status) { + case RRDCALC_STATUS_CLEAR: + status = RRDCALC_STATUS_CLEAR; + break; - // -------------------------------------------------------- - // check if the new status and the old differ + case RRDCALC_STATUS_RAISED: + status = RRDCALC_STATUS_WARNING; + break; - if (status != rc->status) { - worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); - int delay = 0; - - // apply trigger hysteresis + default: + break; + } - if (now > rc->delay_up_to_timestamp) { - rc->delay_up_current = rc->delay_up_duration; - rc->delay_down_current = rc->delay_down_duration; - rc->delay_last = 0; - rc->delay_up_to_timestamp = 0; - } else { - rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier); - if (rc->delay_up_current > rc->delay_max_duration) - rc->delay_up_current = rc->delay_max_duration; + switch (critical_status) { + case RRDCALC_STATUS_CLEAR: + if (status == RRDCALC_STATUS_UNDEFINED) + status = RRDCALC_STATUS_CLEAR; + break; - rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier); - if (rc->delay_down_current > rc->delay_max_duration) - rc->delay_down_current = rc->delay_max_duration; - } + case RRDCALC_STATUS_RAISED: + status = RRDCALC_STATUS_CRITICAL; + break; - if (status > rc->status) - delay = rc->delay_up_current; - else - delay = rc->delay_down_current; + default: + break; + } - // COMMENTED: because we do need to send raising alarms - // if(now + delay < rc->delay_up_to_timestamp) - // delay = (int)(rc->delay_up_to_timestamp - now); + // -------------------------------------------------------- + // check if the new status and the old differ - rc->delay_last = delay; - rc->delay_up_to_timestamp = now + delay; + if (status != rc->status) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); + int delay = 0; + // apply trigger hysteresis - ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context, - rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, - rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, - rc->delay_last, - ( - ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | - ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) - ) - ); - health_alarm_log(host, ae); + if (now > rc->delay_up_to_timestamp) { + rc->delay_up_current = rc->delay_up_duration; + rc->delay_down_current = rc->delay_down_duration; + rc->delay_last = 0; + rc->delay_up_to_timestamp = 0; + } else { + rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier); + if (rc->delay_up_current > rc->delay_max_duration) + rc->delay_up_current = rc->delay_max_duration; - rc->last_status_change = now; - rc->old_status = rc->status; - rc->status = status; + rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier); + if (rc->delay_down_current > rc->delay_max_duration) + rc->delay_down_current = rc->delay_max_duration; } - rc->last_updated = now; - rc->next_update = now + rc->update_every; - - if (next_run > rc->next_update) - next_run = rc->next_update; + if (status > rc->status) + delay = rc->delay_up_current; + else + delay = rc->delay_down_current; + + // COMMENTED: because we do need to send raising alarms + // if(now + delay < rc->delay_up_to_timestamp) + // delay = (int)(rc->delay_up_to_timestamp - now); + + rc->delay_last = delay; + rc->delay_up_to_timestamp = now + delay; + + ALARM_ENTRY *ae = health_create_alarm_entry( + host, + rc->id, + rc->next_event_id++, + rc->config_hash_id, + now, + rc->name, + rc->rrdset->id, + rc->rrdset->context, + rc->rrdset->family, + rc->classification, + rc->component, + rc->type, + rc->exec, + rc->recipient, + now - rc->last_status_change, + rc->old_value, + rc->value, + rc->status, + status, + rc->source, + rc->units, + rc->info, + rc->delay_last, + ( + ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) | + (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0) + ) + ); + + health_alarm_log_add_entry(host, ae); + + log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status)); + + rc->last_status_change = now; + rc->old_status = rc->status; + rc->status = status; } - // process repeating alarms - RRDCALC *rc; - for(rc = host->alarms; rc ; rc = rc->next) { - int repeat_every = 0; - if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) { - if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) { - rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE; - repeat_every = rc->warn_repeat_every; - } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) { - rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE; - repeat_every = rc->crit_repeat_every; - } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) { - if(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE)) { - if(rc->old_status == RRDCALC_STATUS_CRITICAL) { - repeat_every = 1; - } else if (rc->old_status == RRDCALC_STATUS_WARNING) { - repeat_every = 1; - } + rc->last_updated = now; + rc->next_update = now + rc->update_every; + + if (next_run > rc->next_update) + next_run = rc->next_update; + } + foreach_rrdcalc_in_rrdhost_done(rc); + + // process repeating alarms + foreach_rrdcalc_in_rrdhost_read(host, rc) { + int repeat_every = 0; + if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) { + if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) { + rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE; + repeat_every = rc->warn_repeat_every; + } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) { + rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE; + repeat_every = rc->crit_repeat_every; + } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) { + if(!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE)) { + if(rc->old_status == RRDCALC_STATUS_CRITICAL) { + repeat_every = 1; + } else if (rc->old_status == RRDCALC_STATUS_WARNING) { + repeat_every = 1; } } - } else { - continue; } + } else { + continue; + } - if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { - worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); - rc->last_repeat = now; - if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++; - ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context, - rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, - rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info, - rc->delay_last, - ( - ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | - ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) - ) - ); - ae->last_repeat = rc->last_repeat; - if (!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) { - ae->flags |= HEALTH_ENTRY_RUN_ONCE; - } - rc->rrdcalc_flags |= RRDCALC_FLAG_RUN_ONCE; - health_process_notifications(host, ae); - debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id); - health_alarm_wait_for_execution(ae); - health_alarm_log_free_one_nochecks_nounlink(ae); + if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); + rc->last_repeat = now; + if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++; + + ALARM_ENTRY *ae = health_create_alarm_entry( + host, + rc->id, + rc->next_event_id++, + rc->config_hash_id, + now, + rc->name, + rc->rrdset->id, + rc->rrdset->context, + rc->rrdset->family, + rc->classification, + rc->component, + rc->type, + rc->exec, + rc->recipient, + now - rc->last_status_change, + rc->old_value, + rc->value, + rc->old_status, + rc->status, + rc->source, + rc->units, + rc->info, + rc->delay_last, + ( + ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) | + (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0) + ) + ); + + ae->last_repeat = rc->last_repeat; + if (!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) { + ae->flags |= HEALTH_ENTRY_RUN_ONCE; } + rc->run_flags |= RRDCALC_FLAG_RUN_ONCE; + health_process_notifications(host, ae); + debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id); + health_alarm_wait_for_execution(ae); + health_alarm_log_free_one_nochecks_nounlink(ae); } - - rrdhost_unlock(host); } + foreach_rrdcalc_in_rrdhost_done(rc); + } - if (unlikely(netdata_exit)) - break; + if (unlikely(netdata_exit)) + break; - // execute notifications - // and cleanup - worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS); - health_alarm_log_process(host); + // execute notifications + // and cleanup + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS); + health_alarm_log_process(host); - if (unlikely(netdata_exit)) { - // wait for all notifications to finish before allowing health to be cleaned up - ALARM_ENTRY *ae; - while (NULL != (ae = alarm_notifications_in_progress.head)) { - health_alarm_wait_for_execution(ae); - } - break; + if (unlikely(netdata_exit)) { + // wait for all notifications to finish before allowing health to be cleaned up + ALARM_ENTRY *ae; + while (NULL != (ae = alarm_notifications_in_progress.head)) { + health_alarm_wait_for_execution(ae); } - - } /* rrdhost_foreach */ + break; + } // wait for all notifications to finish before allowing health to be cleaned up ALARM_ENTRY *ae; @@ -1215,34 +1533,49 @@ void *health_main(void *ptr) { } #ifdef ENABLE_ACLK - if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) { - rrdhost_foreach_read(host) { - if (unlikely(!host->health_enabled)) - continue; - sql_queue_removed_alerts_to_aclk(host); - } - aclk_alert_reloaded = 0; - marked_aclk_reload_loop = 0; - } + if (netdata_cloud_setting && unlikely(host->aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) { + sql_queue_removed_alerts_to_aclk(host); + host->aclk_alert_reloaded = 0; + marked_aclk_reload_loop = 0; + } #endif - rrd_unlock(); - if(unlikely(netdata_exit)) break; - now = now_realtime_sec(); - if(now < next_run) { - worker_is_idle(); - debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now)); - sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now)); - now = now_realtime_sec(); - } - else - debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop); + health_sleep(next_run, loop, host); } // forever netdata_thread_cleanup_pop(1); return NULL; } + +void health_add_host_labels(void) { + DICTIONARY *labels = localhost->rrdlabels; + + int is_ephemeral = appconfig_get_boolean(&netdata_config, CONFIG_SECTION_HEALTH, "is ephemeral", CONFIG_BOOLEAN_NO); + rrdlabels_add(labels, "_is_ephemeral", is_ephemeral ? "true" : "false", RRDLABEL_SRC_CONFIG); + + int has_unstable_connection = appconfig_get_boolean(&netdata_config, CONFIG_SECTION_HEALTH, "has unstable connection", CONFIG_BOOLEAN_NO); + rrdlabels_add(labels, "_has_unstable_connection", has_unstable_connection ? "true" : "false", RRDLABEL_SRC_CONFIG); +} + +void health_thread_spawn(RRDHOST * host) { + if(!host->health_spawn) { + char tag[NETDATA_THREAD_TAG_MAX + 1]; + snprintfz(tag, NETDATA_THREAD_TAG_MAX, "HEALTH[%s]", rrdhost_hostname(host)); + struct health_state *health = callocz(1, sizeof(*health)); + health->host = host; + + if(netdata_thread_create(&host->health_thread, tag, NETDATA_THREAD_OPTION_JOINABLE, health_main, (void *) health)) { + log_health("[%s]: Failed to create new thread for client.", rrdhost_hostname(host)); + error("HEALTH [%s]: Failed to create new thread for client.", rrdhost_hostname(host)); + } + else { + log_health("[%s]: Created new thread for client.", rrdhost_hostname(host)); + host->health_spawn = 1; + host->aclk_alert_reloaded = 1; + } + } +} |