diff options
Diffstat (limited to 'health/health.c')
-rw-r--r-- | health/health.c | 145 |
1 files changed, 57 insertions, 88 deletions
diff --git a/health/health.c b/health/health.c index b34f54ab..5c2b85bc 100644 --- a/health/health.c +++ b/health/health.c @@ -17,6 +17,11 @@ #error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 10 #endif +unsigned int default_health_enabled = 1; +char *silencers_filename; +SIMPLE_PATTERN *conf_enabled_alarms = NULL; +DICTIONARY *health_rrdvars; + static bool prepare_command(BUFFER *wb, const char *exec, const char *recipient, @@ -157,10 +162,6 @@ static bool prepare_command(BUFFER *wb, return true; } -unsigned int default_health_enabled = 1; -char *silencers_filename; -SIMPLE_PATTERN *conf_enabled_alarms = NULL; - // the queue of executed alarm notifications that haven't been waited for yet static struct { ALARM_ENTRY *head; // oldest @@ -346,6 +347,15 @@ static void health_reload_host(RRDHOST *host) { rrdcalctemplate_link_matching_templates_to_rrdset(st); } rrdset_foreach_done(st); + +#ifdef ENABLE_ACLK + if (netdata_cloud_setting) { + struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; + if (likely(wc)) { + wc->alert_queue_removed = SEND_REMOVED_AFTER_HEALTH_LOOPS; + } + } +#endif } /** @@ -356,19 +366,11 @@ static void health_reload_host(RRDHOST *host) { void health_reload(void) { sql_refresh_hashes(); - rrd_rdlock(); - RRDHOST *host; - rrdhost_foreach_read(host) + dfe_start_reentrant(rrdhost_root_index, host){ health_reload_host(host); - - rrd_unlock(); - -#ifdef ENABLE_ACLK - if (netdata_cloud_setting) { - aclk_alert_reloaded = 1; } -#endif + dfe_done(host); } // ---------------------------------------------------------------------------- @@ -752,7 +754,8 @@ static void health_main_cleanup(void *ptr) { log_health("Health thread ended."); } -static void initialize_health(RRDHOST *host, int is_localhost) { +static void initialize_health(RRDHOST *host) +{ if(!host->health.health_enabled || rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH) || !service_running(SERVICE_HEALTH)) @@ -779,25 +782,13 @@ static void initialize_health(RRDHOST *host, int is_localhost) { else host->health_log.max = (unsigned int)n; - conf_enabled_alarms = simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), NULL, SIMPLE_PATTERN_EXACT); + conf_enabled_alarms = simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), NULL, + SIMPLE_PATTERN_EXACT, true); netdata_rwlock_init(&host->health_log.alarm_log_rwlock); char filename[FILENAME_MAX + 1]; - if(!is_localhost) { - int r = mkdir(host->varlib_dir, 0775); - if (r != 0 && errno != EEXIST) - error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), host->varlib_dir); - } - - { - snprintfz(filename, FILENAME_MAX, "%s/health", host->varlib_dir); - int r = mkdir(filename, 0775); - if(r != 0 && errno != EEXIST) - error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), filename); - } - snprintfz(filename, FILENAME_MAX, "%s/alarm-notify.sh", netdata_configured_primary_plugins_dir); host->health.health_default_exec = string_strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename)); host->health.health_default_recipient = string_strdupz("root"); @@ -814,7 +805,7 @@ static void initialize_health(RRDHOST *host, int is_localhost) { // link the loaded alarms to their charts RRDSET *st; - rrdset_foreach_write(st, host) { + rrdset_foreach_reentrant(st, host) { if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED)) continue; @@ -849,11 +840,11 @@ static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *sil for (s = silencers->silencers; s!=NULL; s=s->next){ if ( - (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern, rrdcalc_name(rc)))) && - (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern, rrdset_context(rc->rrdset)))) && - (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) && - (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern, rrdcalc_chart_name(rc)))) && - (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern, rrdset_family(rc->rrdset)))) + (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches_string(s->alarms_pattern, rc->name))) && + (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches_string(s->contexts_pattern, rc->rrdset->context))) && + (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern, host))) && + (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches_string(s->charts_pattern, rc->chart))) && + (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches_string(s->families_pattern, rc->rrdset->family))) ) { debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families); if (unlikely(silencers->stype == STYPE_NONE)) { @@ -925,19 +916,6 @@ static void health_execute_delayed_initializations(RRDHOST *host) { worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET); - if(!st->rrdfamily) - st->rrdfamily = rrdfamily_add_and_acquire(host, rrdset_family(st)); - - if(!st->rrdvars) - st->rrdvars = rrdvariables_create(); - - rrddimvar_index_init(st); - - rrdsetvar_add_and_leave_released(st, "last_collected_t", RRDVAR_TYPE_TIME_T, &st->last_collected_time.tv_sec, RRDVAR_FLAG_NONE); - rrdsetvar_add_and_leave_released(st, "green", RRDVAR_TYPE_CALCULATED, &st->green, RRDVAR_FLAG_NONE); - rrdsetvar_add_and_leave_released(st, "red", RRDVAR_TYPE_CALCULATED, &st->red, RRDVAR_FLAG_NONE); - rrdsetvar_add_and_leave_released(st, "update_every", RRDVAR_TYPE_INT, &st->update_every, RRDVAR_FLAG_NONE); - rrdcalc_link_matching_alerts_to_rrdset(st); rrdcalctemplate_link_matching_templates_to_rrdset(st); @@ -948,19 +926,19 @@ static void health_execute_delayed_initializations(RRDHOST *host) { worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM); - rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_CALCULATED, NULL, NULL, &rd->last_stored_value, RRDVAR_FLAG_NONE); - rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_COLLECTED, NULL, "_raw", &rd->last_collected_value, RRDVAR_FLAG_NONE); - rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_TIME_T, NULL, "_last_collected_t", &rd->last_collected_time.tv_sec, RRDVAR_FLAG_NONE); - RRDCALCTEMPLATE *rt; foreach_rrdcalctemplate_read(host, rt) { if(!rt->foreach_dimension_pattern) continue; - if(rrdcalctemplate_check_rrdset_conditions(rt, st, host)) + if(rrdcalctemplate_check_rrdset_conditions(rt, st, host)) { rrdcalctemplate_check_rrddim_conditions_and_link(rt, st, rd, host); + } } foreach_rrdcalctemplate_done(rt); + + if (health_variable_check(health_rrdvars, st, rd)) + rrdvar_store_for_chart(host, st); } rrddim_foreach_done(rd); } @@ -1002,9 +980,7 @@ void *health_main(void *ptr) { rrdcalc_delete_alerts_not_matching_host_labels_from_all_hosts(); unsigned int loop = 0; -#ifdef ENABLE_ACLK - unsigned int marked_aclk_reload_loop = 0; -#endif + while(service_running(SERVICE_HEALTH)) { loop++; debug(D_HEALTH, "Health monitoring iteration no %u started", loop); @@ -1033,15 +1009,8 @@ void *health_main(void *ptr) { } } -#ifdef ENABLE_ACLK - if (aclk_alert_reloaded && !marked_aclk_reload_loop) - marked_aclk_reload_loop = loop; -#endif - worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK); - rrd_rdlock(); - - rrdhost_foreach_read(host) { + dfe_start_reentrant(rrdhost_root_index, host) { if(unlikely(!service_running(SERVICE_HEALTH))) break; @@ -1049,11 +1018,8 @@ void *health_main(void *ptr) { if (unlikely(!host->health.health_enabled)) continue; - if (unlikely(!rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))) { - rrd_unlock(); - initialize_health(host, host == localhost); - rrd_rdlock(); - } + if (unlikely(!rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))) + initialize_health(host); health_execute_delayed_initializations(host); @@ -1147,7 +1113,7 @@ void *health_main(void *ptr) { rc->value = NAN; #ifdef ENABLE_ACLK - if (netdata_cloud_setting && likely(!aclk_alert_reloaded)) + if (netdata_cloud_setting) sql_queue_alarm_to_aclk(host, ae, 1); #endif } @@ -1518,9 +1484,28 @@ void *health_main(void *ptr) { } break; } - } //for each host +#ifdef ENABLE_ACLK + if (netdata_cloud_setting) { + struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; + if (unlikely(!wc)) { + continue; + } + + if (wc->alert_queue_removed == 1) { + sql_queue_removed_alerts_to_aclk(host); + } else if (wc->alert_queue_removed > 1) { + wc->alert_queue_removed--; + } - rrd_unlock(); + if (wc->alert_checkpoint_req == 1) { + aclk_push_alarm_checkpoint(host); + } else if (wc->alert_checkpoint_req > 1) { + wc->alert_checkpoint_req--; + } + } +#endif + } + dfe_done(host); // wait for all notifications to finish before allowing health to be cleaned up ALARM_ENTRY *ae; @@ -1531,22 +1516,6 @@ void *health_main(void *ptr) { health_alarm_wait_for_execution(ae); } -#ifdef ENABLE_ACLK - if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) { - rrdhost_foreach_read(host) { - if(unlikely(!service_running(SERVICE_HEALTH))) - break; - - if (unlikely(!host->health.health_enabled)) - continue; - - sql_queue_removed_alerts_to_aclk(host); - } - aclk_alert_reloaded = 0; - marked_aclk_reload_loop = 0; - } -#endif - if(unlikely(!service_running(SERVICE_HEALTH))) break; |