diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2019-02-08 07:31:03 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2019-02-08 07:31:03 +0000 |
commit | 50485bedfd9818165aa1d039d0abe95a559134b7 (patch) | |
tree | 79c7b08f67edcfb0c936e7a22931653b91189b9f /health/health.c | |
parent | Releasing debian version 1.11.1+dfsg-7. (diff) | |
download | netdata-50485bedfd9818165aa1d039d0abe95a559134b7.tar.xz netdata-50485bedfd9818165aa1d039d0abe95a559134b7.zip |
Merging upstream version 1.12.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health/health.c')
-rw-r--r-- | health/health.c | 784 |
1 files changed, 425 insertions, 359 deletions
diff --git a/health/health.c b/health/health.c index ae0c464b1..f92a1ba6b 100644 --- a/health/health.c +++ b/health/health.c @@ -2,6 +2,12 @@ #include "health.h" +struct health_cmdapi_thread_status { + int status; + ; + struct rusage rusage; +}; + unsigned int default_health_enabled = 1; // ---------------------------------------------------------------------------- @@ -147,13 +153,41 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { } } + // Check if alarm notifications are silenced + if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) { + info("Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + goto done; + } + static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1]; pid_t command_pid; const char *exec = (ae->exec) ? ae->exec : host->health_default_exec; const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient; - snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s'", + int n_warn=0, n_crit=0; + RRDCALC *rc; + EVAL_EXPRESSION *expr=NULL; + + for(rc = host->alarms; rc ; rc = rc->next) { + if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) + continue; + + if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) { + n_warn++; + if (ae->alarm_id == rc->id) + expr=rc->warning; + } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) { + n_crit++; + if (ae->alarm_id == rc->id) + expr=rc->critical; + } else if (unlikely(rc->status == RRDCALC_STATUS_CLEAR)) { + if (ae->alarm_id == rc->id) + expr=rc->warning; + } + } + + snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d'", exec, recipient, host->registry_hostname, @@ -162,7 +196,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { ae->alarm_event_id, (unsigned long)ae->when, ae->name, - ae->chart?ae->chart:"NOCAHRT", + ae->chart?ae->chart:"NOCHART", ae->family?ae->family:"NOFAMILY", rrdcalc_status2string(ae->new_status), rrdcalc_status2string(ae->old_status), @@ -174,7 +208,11 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { ae->units?ae->units:"", ae->info?ae->info:"", ae->new_value_string, - ae->old_value_string + ae->old_value_string, + (expr && expr->source)?expr->source:"NOSOURCE", + (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG", + n_warn, + n_crit ); ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN; @@ -355,6 +393,67 @@ static void health_main_cleanup(void *ptr) { static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; } +SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { + SILENCER *s; + debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s", + rc->name, (rc->rrdset)?rc->rrdset->context:"", rc->chart, host, (rc->rrdset)?rc->rrdset->family:""); + + for (s = silencers->silencers; s!=NULL; s=s->next){ + if ( + (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern,rc->name))) && + (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern,rc->rrdset->context))) && + (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) && + (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern,rc->chart))) && + (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern,rc->rrdset->family))) + ) { + debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families); + if (unlikely(silencers->stype == STYPE_NONE)) { + debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name); + } else { + debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s" + , (silencers->stype==STYPE_DISABLE_ALARMS)?"Disabled":"Silenced" + , rc->name + , (rc->rrdset)?rc->rrdset->context:"" + , rc->chart + , host + , (rc->rrdset)?rc->rrdset->family:"" + ); + } + return silencers->stype; + } + } + return STYPE_NONE; +} + +int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { + uint32_t rrdcalc_flags_old = rc->rrdcalc_flags; + // Clear the flags + rc->rrdcalc_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED); + if (unlikely(silencers->all_alarms)) { + if (silencers->stype == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED; + else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED; + } else { + SILENCE_TYPE st = check_silenced(rc, host->hostname, silencers); + if (st == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED; + else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED; + } + + if (rrdcalc_flags_old != rc->rrdcalc_flags) { + info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s", + host->hostname, + rc->name, + (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false", + (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false", + (rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false", + (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false" + ); + } + if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED) + return 1; + else + return 0; +} + void *health_main(void *ptr) { netdata_thread_cleanup_push(health_main_cleanup, ptr); @@ -365,371 +464,338 @@ void *health_main(void *ptr) { time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60); unsigned int loop = 0; - while(!netdata_exit) { - loop++; - debug(D_HEALTH, "Health monitoring iteration no %u started", loop); - - int runnable = 0, apply_hibernation_delay = 0; - time_t next_run = now + min_run_every; - RRDCALC *rc; - - if(unlikely(check_if_resumed_from_suspention())) { - apply_hibernation_delay = 1; - - info("Postponing alarm checks for %ld seconds, because it seems that the system was just resumed from suspension." - , hibernation_delay - ); - } - - rrd_rdlock(); - - RRDHOST *host; - rrdhost_foreach_read(host) { - if(unlikely(!host->health_enabled)) - continue; - - if(unlikely(apply_hibernation_delay)) { - info("Postponing health checks for %ld seconds, on host '%s'." - , hibernation_delay - , host->hostname - ); + silencers = mallocz(sizeof(SILENCERS)); + silencers->all_alarms=0; + silencers->stype=STYPE_NONE; + silencers->silencers=NULL; - host->health_delay_up_to = now + hibernation_delay; - } - - if(unlikely(host->health_delay_up_to)) { - if(unlikely(now < host->health_delay_up_to)) - continue; - - info("Resuming health checks on host '%s'.", host->hostname); - host->health_delay_up_to = 0; - } - - rrdhost_rdlock(host); - - // the first loop is to lookup values from the db - for(rc = host->alarms; rc; rc = rc->next) { - if(unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) { - if(unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE; - continue; - } - - runnable++; - rc->old_value = rc->value; - rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE; - - // ------------------------------------------------------------ - // if there is database lookup, do it - - if(unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) { - /* time_t old_db_timestamp = rc->db_before; */ - int value_is_null = 0; - - int ret = rrdset2value_api_v1(rc->rrdset - , NULL - , &rc->value - , rc->dimensions - , 1 - , rc->after - , rc->before - , rc->group - , 0 - , rc->options - , &rc->db_after - , &rc->db_before - , &value_is_null - ); - - if(unlikely(ret != 200)) { - // database lookup failed - rc->value = NAN; - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR; - - debug(D_HEALTH - , "Health on host '%s', alarm '%s.%s': database lookup returned error %d" - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - , ret - ); - } - else - rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR; - - /* - RRDCALC_FLAG_DB_STALE not currently used - if (unlikely(old_db_timestamp == rc->db_before)) { - // database is stale - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); - - if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE; - error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); - } - } - else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE; - */ - - if(unlikely(value_is_null)) { - // collected value is null - rc->value = NAN; - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN; - - debug(D_HEALTH - , "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)" - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - ); - } - else - rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN; - - debug(D_HEALTH - , "Health on host '%s', alarm '%s.%s': database lookup gave value " CALCULATED_NUMBER_FORMAT - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - , rc->value - ); - } - - // ------------------------------------------------------------ - // if there is calculation expression, run it - - if(unlikely(rc->calculation)) { - if(unlikely(!expression_evaluate(rc->calculation))) { - // calculation failed - rc->value = NAN; - rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR; - - debug(D_HEALTH - , "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s" - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - , rc->calculation->parsed_as - , buffer_tostring(rc->calculation->error_msg) - ); - } - else { - rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR; - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)" - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - , rc->calculation->parsed_as - , rc->calculation->result - , buffer_tostring(rc->calculation->error_msg) - , rc->source - ); - - rc->value = rc->calculation->result; - - if(rc->local) rc->local->last_updated = now; - if(rc->family) rc->family->last_updated = now; - if(rc->hostid) rc->hostid->last_updated = now; - if(rc->hostname) rc->hostname->last_updated = now; - } - } - } - rrdhost_unlock(host); - - if(unlikely(runnable && !netdata_exit)) { - rrdhost_rdlock(host); - - for(rc = host->alarms; rc; rc = rc->next) { - if(unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))) - continue; - - RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED; - RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED; - - // -------------------------------------------------------- - // check the warning expression - - if(likely(rc->warning)) { - if(unlikely(!expression_evaluate(rc->warning))) { - // calculation failed - rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR; - - debug(D_HEALTH - , "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s" - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - , buffer_tostring(rc->warning->error_msg) - ); - } - else { - rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR; - debug(D_HEALTH - , "Health on host '%s', alarm '%s.%s': warning expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)" - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - , rc->warning->result - , buffer_tostring(rc->warning->error_msg) - , rc->source - ); - warning_status = rrdcalc_value2status(rc->warning->result); - } - } - - // -------------------------------------------------------- - // check the critical expression - - if(likely(rc->critical)) { - if(unlikely(!expression_evaluate(rc->critical))) { - // calculation failed - rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR; - - debug(D_HEALTH - , "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s" - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - , buffer_tostring(rc->critical->error_msg) - ); - } - else { - rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR; - debug(D_HEALTH - , "Health on host '%s', alarm '%s.%s': critical expression gave value " CALCULATED_NUMBER_FORMAT ": %s (source: %s)" - , host->hostname - , rc->chart ? rc->chart : "NOCHART" - , rc->name - , rc->critical->result - , buffer_tostring(rc->critical->error_msg) - , rc->source - ); - critical_status = rrdcalc_value2status(rc->critical->result); - } - } - - // -------------------------------------------------------- - // decide the final alarm status - - RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED; - - switch(warning_status) { - case RRDCALC_STATUS_CLEAR: - status = RRDCALC_STATUS_CLEAR; - break; - - case RRDCALC_STATUS_RAISED: - status = RRDCALC_STATUS_WARNING; - break; - - default: - break; - } - - switch(critical_status) { - case RRDCALC_STATUS_CLEAR: - if(status == RRDCALC_STATUS_UNDEFINED) - status = RRDCALC_STATUS_CLEAR; - break; - - case RRDCALC_STATUS_RAISED: - status = RRDCALC_STATUS_CRITICAL; - break; - - default: - break; - } - - // -------------------------------------------------------- - // check if the new status and the old differ - - if(status != rc->status) { - int delay = 0; - - // apply trigger hysteresis - - if(now > rc->delay_up_to_timestamp) { - rc->delay_up_current = rc->delay_up_duration; - rc->delay_down_current = rc->delay_down_duration; - rc->delay_last = 0; - rc->delay_up_to_timestamp = 0; - } - else { - rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier); - if(rc->delay_up_current > rc->delay_max_duration) - rc->delay_up_current = rc->delay_max_duration; - - rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier); - if(rc->delay_down_current > rc->delay_max_duration) - rc->delay_down_current = rc->delay_max_duration; - } - - if(status > rc->status) - delay = rc->delay_up_current; - else - delay = rc->delay_down_current; - - // COMMENTED: because we do need to send raising alarms - // if(now + delay < rc->delay_up_to_timestamp) - // delay = (int)(rc->delay_up_to_timestamp - now); - - rc->delay_last = delay; - rc->delay_up_to_timestamp = now + delay; - - // add the alarm into the log - - health_alarm_log( - host - , rc->id - , rc->next_event_id++ - , now - , rc->name - , rc->rrdset->id - , rc->rrdset->family - , rc->exec - , rc->recipient - , now - rc->last_status_change - , rc->old_value - , rc->value - , rc->status - , status - , rc->source - , rc->units - , rc->info - , rc->delay_last - , (rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION) ? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0 - ); - - rc->last_status_change = now; - rc->status = status; - } - - rc->last_updated = now; - rc->next_update = now + rc->update_every; + while(!netdata_exit) { + loop++; + debug(D_HEALTH, "Health monitoring iteration no %u started", loop); - if(next_run > rc->next_update) - next_run = rc->next_update; - } + int runnable = 0, apply_hibernation_delay = 0; + time_t next_run = now + min_run_every; + RRDCALC *rc; + + if (unlikely(check_if_resumed_from_suspention())) { + apply_hibernation_delay = 1; + + info("Postponing alarm checks for %ld seconds, because it seems that the system was just resumed from suspension.", + hibernation_delay + ); + } + + if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) { + static int logged=0; + if (!logged) { + info("Skipping health checks, because all alarms are disabled via a %s command.", + HEALTH_CMDAPI_CMD_DISABLEALL); + logged = 1; + } + } + + rrd_rdlock(); + + RRDHOST *host; + rrdhost_foreach_read(host) { + if (unlikely(!host->health_enabled)) + continue; + + if (unlikely(apply_hibernation_delay)) { + + info("Postponing health checks for %ld seconds, on host '%s'.", hibernation_delay, host->hostname + ); + + host->health_delay_up_to = now + hibernation_delay; + } + + if (unlikely(host->health_delay_up_to)) { + if (unlikely(now < host->health_delay_up_to)) + continue; + + info("Resuming health checks on host '%s'.", host->hostname); + host->health_delay_up_to = 0; + } + + rrdhost_rdlock(host); + + // the first loop is to lookup values from the db + for (rc = host->alarms; rc; rc = rc->next) { + + if (update_disabled_silenced(host, rc)) + continue; + + if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) { + if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)) + rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE; + continue; + } + + runnable++; + rc->old_value = rc->value; + rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE; + + // ------------------------------------------------------------ + // if there is database lookup, do it + + if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) { + /* time_t old_db_timestamp = rc->db_before; */ + int value_is_null = 0; + + int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1, rc->after, + rc->before, rc->group, 0, rc->options, &rc->db_after, + &rc->db_before, &value_is_null + ); + + if (unlikely(ret != 200)) { + // database lookup failed + rc->value = NAN; + rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR; + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", + host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret + ); + } else + rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR; + + /* - RRDCALC_FLAG_DB_STALE not currently used + if (unlikely(old_db_timestamp == rc->db_before)) { + // database is stale + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); + + if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) { + rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE; + error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); + } + } + else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE)) + rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE; + */ + + if (unlikely(value_is_null)) { + // collected value is null + rc->value = NAN; + rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN; + + debug(D_HEALTH, + "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", + host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name + ); + } else + rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN; + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " + CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, + rc->value + ); + } + + // ------------------------------------------------------------ + // if there is calculation expression, run it + + if (unlikely(rc->calculation)) { + if (unlikely(!expression_evaluate(rc->calculation))) { + // calculation failed + rc->value = NAN; + rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR; + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", + host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, + rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg) + ); + } else { + rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR; + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " + CALCULATED_NUMBER_FORMAT + ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, + rc->calculation->parsed_as, rc->calculation->result, + buffer_tostring(rc->calculation->error_msg), rc->source + ); + + rc->value = rc->calculation->result; + + if (rc->local) rc->local->last_updated = now; + if (rc->family) rc->family->last_updated = now; + if (rc->hostid) rc->hostid->last_updated = now; + if (rc->hostname) rc->hostname->last_updated = now; + } + } + } + + rrdhost_unlock(host); + + if (unlikely(runnable && !netdata_exit)) { + rrdhost_rdlock(host); + + for (rc = host->alarms; rc; rc = rc->next) { + if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))) + continue; + + if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED) { + continue; + } + RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED; + RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED; + + // -------------------------------------------------------- + // check the warning expression + + if (likely(rc->warning)) { + if (unlikely(!expression_evaluate(rc->warning))) { + // calculation failed + rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR; + + debug(D_HEALTH, + "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", + host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, + buffer_tostring(rc->warning->error_msg) + ); + } else { + rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR; + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " + CALCULATED_NUMBER_FORMAT + ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", + rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source + ); + warning_status = rrdcalc_value2status(rc->warning->result); + } + } + + // -------------------------------------------------------- + // check the critical expression + + if (likely(rc->critical)) { + if (unlikely(!expression_evaluate(rc->critical))) { + // calculation failed + rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR; + + debug(D_HEALTH, + "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", + host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, + buffer_tostring(rc->critical->error_msg) + ); + } else { + rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR; + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " + CALCULATED_NUMBER_FORMAT + ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", + rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg), + rc->source + ); + critical_status = rrdcalc_value2status(rc->critical->result); + } + } + + // -------------------------------------------------------- + // decide the final alarm status + + RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED; + + switch (warning_status) { + case RRDCALC_STATUS_CLEAR: + status = RRDCALC_STATUS_CLEAR; + break; + + case RRDCALC_STATUS_RAISED: + status = RRDCALC_STATUS_WARNING; + break; + + default: + break; + } + + switch (critical_status) { + case RRDCALC_STATUS_CLEAR: + if (status == RRDCALC_STATUS_UNDEFINED) + status = RRDCALC_STATUS_CLEAR; + break; + + case RRDCALC_STATUS_RAISED: + status = RRDCALC_STATUS_CRITICAL; + break; + + default: + break; + } + + // -------------------------------------------------------- + // check if the new status and the old differ + + if (status != rc->status) { + int delay = 0; + + // apply trigger hysteresis + + if (now > rc->delay_up_to_timestamp) { + rc->delay_up_current = rc->delay_up_duration; + rc->delay_down_current = rc->delay_down_duration; + rc->delay_last = 0; + rc->delay_up_to_timestamp = 0; + } else { + rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier); + if (rc->delay_up_current > rc->delay_max_duration) + rc->delay_up_current = rc->delay_max_duration; + + rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier); + if (rc->delay_down_current > rc->delay_max_duration) + rc->delay_down_current = rc->delay_max_duration; + } + + if (status > rc->status) + delay = rc->delay_up_current; + else + delay = rc->delay_down_current; + + // COMMENTED: because we do need to send raising alarms + // if(now + delay < rc->delay_up_to_timestamp) + // delay = (int)(rc->delay_up_to_timestamp - now); + + rc->delay_last = delay; + rc->delay_up_to_timestamp = now + delay; + + health_alarm_log( + host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, + rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, + rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, + rc->delay_last, + ( + ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) + ) + + ); + + rc->last_status_change = now; + rc->status = status; + } + + rc->last_updated = now; + rc->next_update = now + rc->update_every; + + if (next_run > rc->next_update) + next_run = rc->next_update; + } + + rrdhost_unlock(host); + } - rrdhost_unlock(host); - } + if (unlikely(netdata_exit)) + break; - if(unlikely(netdata_exit)) - break; + // execute notifications + // and cleanup + health_alarm_log_process(host); - // execute notifications - // and cleanup - health_alarm_log_process(host); + if (unlikely(netdata_exit)) + break; - if(unlikely(netdata_exit)) - break; + } /* rrdhost_foreach */ - } /* rrdhost_foreach */ + rrd_unlock(); - rrd_unlock(); if(unlikely(netdata_exit)) break; |