diff options
Diffstat (limited to 'health/health.c')
-rw-r--r-- | health/health.c | 112 |
1 files changed, 74 insertions, 38 deletions
diff --git a/health/health.c b/health/health.c index 3c1e5693..9eb36a9c 100644 --- a/health/health.c +++ b/health/health.c @@ -11,6 +11,12 @@ static struct { ALARM_ENTRY *tail; // latest } alarm_notifications_in_progress = {NULL, NULL}; +typedef struct active_alerts { + char *name; + time_t last_status_change; + RRDCALC_STATUS status; +} active_alerts_t; + static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae) { ae->prev_in_progress = NULL; @@ -219,10 +225,6 @@ static void health_reload_host(RRDHOST *host) { * Reload the host configuration for all hosts. */ void health_reload(void) { -#ifdef ENABLE_ACLK - if (netdata_cloud_setting) - aclk_single_update_disable(); -#endif sql_refresh_hashes(); rrd_rdlock(); @@ -234,11 +236,7 @@ void health_reload(void) { rrd_unlock(); #ifdef ENABLE_ACLK if (netdata_cloud_setting) { - aclk_single_update_enable(); - aclk_alarm_reload(); -#ifdef ENABLE_NEW_CLOUD_PROTOCOL aclk_alert_reloaded = 1; -#endif } #endif } @@ -246,13 +244,22 @@ void health_reload(void) { // ---------------------------------------------------------------------------- // health main thread and friends -static inline RRDCALC_STATUS rrdcalc_value2status(calculated_number n) { +static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) { if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED; if(n) return RRDCALC_STATUS_RAISED; return RRDCALC_STATUS_CLEAR; } #define ALARM_EXEC_COMMAND_LENGTH 8192 +#define ACTIVE_ALARMS_LIST_EXAMINE 500 +#define ACTIVE_ALARMS_LIST 15 + +static inline int compare_active_alerts(const void * a, const void * b) { + active_alerts_t *active_alerts_a = (active_alerts_t *)a; + active_alerts_t *active_alerts_b = (active_alerts_t *)b; + + return ( active_alerts_b->last_status_change - active_alerts_a->last_status_change ); +} static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED; @@ -318,31 +325,28 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { RRDCALC *rc; EVAL_EXPRESSION *expr=NULL; BUFFER *warn_alarms, *crit_alarms; + active_alerts_t *active_alerts = callocz(ACTIVE_ALARMS_LIST_EXAMINE, sizeof(active_alerts_t)); warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); - for(rc = host->alarms; rc ; rc = rc->next) { + for(rc = host->alarms; rc && (n_warn + n_crit) < ACTIVE_ALARMS_LIST_EXAMINE ; rc = rc->next) { if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) continue; if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) { if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) { - if (n_warn) - buffer_strcat(warn_alarms, ","); - buffer_strcat(warn_alarms, rc->name); - buffer_strcat(warn_alarms, "="); - buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change); + active_alerts[n_warn+n_crit].name = rc->name; + active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change; + active_alerts[n_warn+n_crit].status = rc->status; n_warn++; } else if (ae->alarm_id == rc->id) expr = rc->warning; } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) { if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) { - if (n_crit) - buffer_strcat(crit_alarms, ","); - buffer_strcat(crit_alarms, rc->name); - buffer_strcat(crit_alarms, "="); - buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change); + active_alerts[n_warn+n_crit].name = rc->name; + active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change; + active_alerts[n_warn+n_crit].status = rc->status; n_crit++; } else if (ae->alarm_id == rc->id) expr = rc->critical; @@ -352,9 +356,34 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { } } + if (n_warn+n_crit>1) + qsort (active_alerts, n_warn+n_crit, sizeof(active_alerts_t), compare_active_alerts); + + int count_w = 0, count_c = 0; + while (count_w + count_c < n_warn + n_crit && count_w + count_c < ACTIVE_ALARMS_LIST) { + if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_WARNING) { + if (count_w) + buffer_strcat(warn_alarms, ","); + buffer_strcat(warn_alarms, active_alerts[count_w+count_c].name); + buffer_strcat(warn_alarms, "="); + buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change); + count_w++; + } + else if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_CRITICAL) { + if (count_c) + buffer_strcat(crit_alarms, ","); + buffer_strcat(crit_alarms, active_alerts[count_w+count_c].name); + buffer_strcat(crit_alarms, "="); + buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change); + count_c++; + } + } + char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN"); - snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'", + snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" NETDATA_DOUBLE_FORMAT_ZERO + "' '" NETDATA_DOUBLE_FORMAT_ZERO + "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'", exec, recipient, host->registry_hostname, @@ -398,6 +427,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { freez(edit_command); buffer_free(warn_alarms); buffer_free(crit_alarms); + freez(active_alerts); return; //health_alarm_wait_for_execution done: @@ -419,7 +449,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) { } static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) { - debug(D_HEALTH, "Health alarm '%s.%s' = " CALCULATED_NUMBER_FORMAT_AUTO " - changed status from %s to %s", + debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s", ae->chart?ae->chart:"NOCHART", ae->name, ae->new_value, rrdcalc_status2string(ae->old_status), @@ -736,7 +766,7 @@ void *health_main(void *ptr) { rrdcalc_labels_unlink(); unsigned int loop = 0; -#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) +#ifdef ENABLE_ACLK unsigned int marked_aclk_reload_loop = 0; #endif while(!netdata_exit) { @@ -765,7 +795,7 @@ void *health_main(void *ptr) { } } -#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) +#ifdef ENABLE_ACLK if (aclk_alert_reloaded && !marked_aclk_reload_loop) marked_aclk_reload_loop = loop; #endif @@ -795,6 +825,11 @@ void *health_main(void *ptr) { host->health_delay_up_to = 0; } + // wait until cleanup of obsolete charts on children is complete + if (host != localhost) + if (unlikely(host->trigger_chart_obsoletion_check == 1)) + continue; + if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0)) sql_health_alarm_log_cleanup(host); @@ -818,7 +853,7 @@ void *health_main(void *ptr) { worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); time_t now = now_realtime_sec(); ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, + host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context, rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, rc->value, NAN, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0, 0); if (ae) { @@ -828,7 +863,7 @@ void *health_main(void *ptr) { rc->last_status_change = now; rc->last_updated = now; rc->value = NAN; -#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) +#ifdef ENABLE_ACLK if (netdata_cloud_setting && likely(!aclk_alert_reloaded)) sql_queue_alarm_to_aclk(host, ae, 1); #endif @@ -855,10 +890,12 @@ void *health_main(void *ptr) { /* time_t old_db_timestamp = rc->db_before; */ int value_is_null = 0; - int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1, rc->after, - rc->before, rc->group, 0, rc->options, &rc->db_after, - &rc->db_before, &value_is_null, 0 - ); + int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1, + rc->after, rc->before, rc->group, NULL, + 0, rc->options, + &rc->db_after,&rc->db_before, + NULL, NULL, NULL, + &value_is_null, NULL, 0, 0); if (unlikely(ret != 200)) { // database lookup failed @@ -898,8 +935,7 @@ void *health_main(void *ptr) { } else rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " - CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->value ); } @@ -923,7 +959,7 @@ void *health_main(void *ptr) { rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR; debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " - CALCULATED_NUMBER_FORMAT + NETDATA_DOUBLE_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->calculation->parsed_as, rc->calculation->result, buffer_tostring(rc->calculation->error_msg), rc->source @@ -972,7 +1008,7 @@ void *health_main(void *ptr) { } else { rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR; debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " - CALCULATED_NUMBER_FORMAT + NETDATA_DOUBLE_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source ); @@ -998,7 +1034,7 @@ void *health_main(void *ptr) { } else { rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR; debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " - CALCULATED_NUMBER_FORMAT + NETDATA_DOUBLE_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg), rc->source @@ -1077,7 +1113,7 @@ void *health_main(void *ptr) { ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, + host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context, rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last, @@ -1129,7 +1165,7 @@ void *health_main(void *ptr) { rc->last_repeat = now; if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++; ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, + host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context, rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info, rc->delay_last, @@ -1178,7 +1214,7 @@ void *health_main(void *ptr) { health_alarm_wait_for_execution(ae); } -#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) +#ifdef ENABLE_ACLK if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) { rrdhost_foreach_read(host) { if (unlikely(!host->health_enabled)) |