diff options
Diffstat (limited to 'health/health.c')
-rw-r--r-- | health/health.c | 73 |
1 files changed, 60 insertions, 13 deletions
diff --git a/health/health.c b/health/health.c index 85d2a245..d8e1d4b7 100644 --- a/health/health.c +++ b/health/health.c @@ -230,6 +230,9 @@ void health_reload(void) { if (netdata_cloud_setting) { aclk_single_update_enable(); aclk_alarm_reload(); +#ifdef ENABLE_NEW_CLOUD_PROTOCOL + aclk_alert_reloaded = 1; +#endif } #endif } @@ -308,26 +311,44 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { int n_warn=0, n_crit=0; RRDCALC *rc; EVAL_EXPRESSION *expr=NULL; + BUFFER *warn_alarms, *crit_alarms; + + warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); + crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); for(rc = host->alarms; rc ; rc = rc->next) { if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) continue; - if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) { - n_warn++; - if (ae->alarm_id == rc->id) - expr=rc->warning; + if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) { + if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) { + if (n_warn) + buffer_strcat(warn_alarms, ","); + buffer_strcat(warn_alarms, rc->name); + buffer_strcat(warn_alarms, "="); + buffer_snprintf(warn_alarms, 11, "%ld", rc->last_status_change); + n_warn++; + } else if (ae->alarm_id == rc->id) + expr = rc->warning; } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) { - n_crit++; - if (ae->alarm_id == rc->id) - expr=rc->critical; + if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) { + if (n_crit) + buffer_strcat(crit_alarms, ","); + buffer_strcat(crit_alarms, rc->name); + buffer_strcat(crit_alarms, "="); + buffer_snprintf(crit_alarms, 11, "%ld", rc->last_status_change); + n_crit++; + } else if (ae->alarm_id == rc->id) + expr = rc->critical; } else if (unlikely(rc->status == RRDCALC_STATUS_CLEAR)) { if (ae->alarm_id == rc->id) - expr=rc->warning; + expr = rc->warning; } } - snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d'", + char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0"); + + snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'", exec, recipient, host->registry_hostname, @@ -352,7 +373,12 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { (expr && expr->source)?expr->source:"NOSOURCE", (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG", n_warn, - n_crit + n_crit, + buffer_tostring(warn_alarms), + buffer_tostring(crit_alarms), + ae->classification?ae->classification:"Unknown", + edit_command, + localhost->registry_hostname ); ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN; @@ -363,6 +389,10 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { ae->exec_spawn_serial = spawn_enq_cmd(command_to_run); enqueue_alarm_notify_in_progress(ae); + freez(edit_command); + buffer_free(warn_alarms); + buffer_free(crit_alarms); + return; //health_alarm_wait_for_execution done: health_alarm_log_save(host, ae); @@ -635,6 +665,8 @@ void *health_main(void *ptr) { int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10); if(min_run_every < 1) min_run_every = 1; + int cleanup_sql_every_loop = 7200 / min_run_every; + time_t now = now_realtime_sec(); time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60); @@ -689,6 +721,9 @@ void *health_main(void *ptr) { host->health_delay_up_to = 0; } + if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0)) + sql_health_alarm_log_cleanup(host); + rrdhost_rdlock(host); // the first loop is to lookup values from the db @@ -929,7 +964,7 @@ void *health_main(void *ptr) { if(likely(!rrdcalc_isrepeating(rc))) { ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, + host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last, @@ -979,7 +1014,7 @@ void *health_main(void *ptr) { if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { rc->last_repeat = now; ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, + host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info, rc->delay_last, @@ -1003,6 +1038,14 @@ void *health_main(void *ptr) { rrdhost_unlock(host); } +#ifdef ENABLE_ACLK +#ifdef ENABLE_NEW_CLOUD_PROTOCOL + if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > 2) { + sql_queue_removed_alerts_to_aclk(host); + } +#endif +#endif + if (unlikely(netdata_exit)) break; @@ -1027,8 +1070,12 @@ void *health_main(void *ptr) { health_alarm_wait_for_execution(ae); } - rrd_unlock(); +#ifdef ENABLE_NEW_CLOUD_PROTOCOL + if (netdata_cloud_setting && unlikely(aclk_alert_reloaded)) + aclk_alert_reloaded = 0; +#endif + rrd_unlock(); if(unlikely(netdata_exit)) break; |