summary | refs | log | tree | commit | diff | stats
path: root/health/health.c
diff options
context:
space:
mode:
Diffstat (limited to 'health/health.c')
-rw-r--r-- health/health.c 145
1 files changed, 57 insertions, 88 deletions
diff --git a/health/health.c b/health/health.c
index b34f54ab5..5c2b85bc5 100644
--- a/health/health.c
+++ b/health/health.c
@@ -17,6 +17,11 @@
#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 10
#endif
+unsigned int default_health_enabled = 1;
+char *silencers_filename;
+SIMPLE_PATTERN *conf_enabled_alarms = NULL;
+DICTIONARY *health_rrdvars;
+
static bool prepare_command(BUFFER *wb,
const char *exec,
const char *recipient,
@@ -157,10 +162,6 @@ static bool prepare_command(BUFFER *wb,
return true;
}
-unsigned int default_health_enabled = 1;
-char *silencers_filename;
-SIMPLE_PATTERN *conf_enabled_alarms = NULL;
-
// the queue of executed alarm notifications that haven't been waited for yet
static struct {
ALARM_ENTRY *head; // oldest
@@ -346,6 +347,15 @@ static void health_reload_host(RRDHOST *host) {
rrdcalctemplate_link_matching_templates_to_rrdset(st);
}
rrdset_foreach_done(st);
+
+#ifdef ENABLE_ACLK
+ if (netdata_cloud_setting) {
+ struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
+ if (likely(wc)) {
+ wc->alert_queue_removed = SEND_REMOVED_AFTER_HEALTH_LOOPS;
+ }
+ }
+#endif
}
/**
@@ -356,19 +366,11 @@ static void health_reload_host(RRDHOST *host) {
void health_reload(void) {
sql_refresh_hashes();
- rrd_rdlock();
-
RRDHOST *host;
- rrdhost_foreach_read(host)
+ dfe_start_reentrant(rrdhost_root_index, host){
health_reload_host(host);
-
- rrd_unlock();
-
-#ifdef ENABLE_ACLK
- if (netdata_cloud_setting) {
- aclk_alert_reloaded = 1;
}
-#endif
+ dfe_done(host);
}
// ----------------------------------------------------------------------------
@@ -752,7 +754,8 @@ static void health_main_cleanup(void *ptr) {
log_health("Health thread ended.");
}
-static void initialize_health(RRDHOST *host, int is_localhost) {
+static void initialize_health(RRDHOST *host)
+{
if(!host->health.health_enabled ||
rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH) ||
!service_running(SERVICE_HEALTH))
@@ -779,25 +782,13 @@ static void initialize_health(RRDHOST *host, int is_localhost) {
else
host->health_log.max = (unsigned int)n;
- conf_enabled_alarms = simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), NULL, SIMPLE_PATTERN_EXACT);
+ conf_enabled_alarms = simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), NULL,
+ SIMPLE_PATTERN_EXACT, true);
netdata_rwlock_init(&host->health_log.alarm_log_rwlock);
char filename[FILENAME_MAX + 1];
- if(!is_localhost) {
- int r = mkdir(host->varlib_dir, 0775);
- if (r != 0 && errno != EEXIST)
- error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), host->varlib_dir);
- }
-
- {
- snprintfz(filename, FILENAME_MAX, "%s/health", host->varlib_dir);
- int r = mkdir(filename, 0775);
- if(r != 0 && errno != EEXIST)
- error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), filename);
- }
-
snprintfz(filename, FILENAME_MAX, "%s/alarm-notify.sh", netdata_configured_primary_plugins_dir);
host->health.health_default_exec = string_strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename));
host->health.health_default_recipient = string_strdupz("root");
@@ -814,7 +805,7 @@ static void initialize_health(RRDHOST *host, int is_localhost) {
// link the loaded alarms to their charts
RRDSET *st;
- rrdset_foreach_write(st, host) {
+ rrdset_foreach_reentrant(st, host) {
if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
continue;
@@ -849,11 +840,11 @@ static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *sil
for (s = silencers->silencers; s!=NULL; s=s->next){
if (
- (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern, rrdcalc_name(rc)))) &&
- (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern, rrdset_context(rc->rrdset)))) &&
- (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) &&
- (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern, rrdcalc_chart_name(rc)))) &&
- (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern, rrdset_family(rc->rrdset))))
+ (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches_string(s->alarms_pattern, rc->name))) &&
+ (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches_string(s->contexts_pattern, rc->rrdset->context))) &&
+ (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern, host))) &&
+ (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches_string(s->charts_pattern, rc->chart))) &&
+ (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches_string(s->families_pattern, rc->rrdset->family)))
) {
debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families);
if (unlikely(silencers->stype == STYPE_NONE)) {
@@ -925,19 +916,6 @@ static void health_execute_delayed_initializations(RRDHOST *host) {
worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET);
- if(!st->rrdfamily)
- st->rrdfamily = rrdfamily_add_and_acquire(host, rrdset_family(st));
-
- if(!st->rrdvars)
- st->rrdvars = rrdvariables_create();
-
- rrddimvar_index_init(st);
-
- rrdsetvar_add_and_leave_released(st, "last_collected_t", RRDVAR_TYPE_TIME_T, &st->last_collected_time.tv_sec, RRDVAR_FLAG_NONE);
- rrdsetvar_add_and_leave_released(st, "green", RRDVAR_TYPE_CALCULATED, &st->green, RRDVAR_FLAG_NONE);
- rrdsetvar_add_and_leave_released(st, "red", RRDVAR_TYPE_CALCULATED, &st->red, RRDVAR_FLAG_NONE);
- rrdsetvar_add_and_leave_released(st, "update_every", RRDVAR_TYPE_INT, &st->update_every, RRDVAR_FLAG_NONE);
-
rrdcalc_link_matching_alerts_to_rrdset(st);
rrdcalctemplate_link_matching_templates_to_rrdset(st);
@@ -948,19 +926,19 @@ static void health_execute_delayed_initializations(RRDHOST *host) {
worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM);
- rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_CALCULATED, NULL, NULL, &rd->last_stored_value, RRDVAR_FLAG_NONE);
- rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_COLLECTED, NULL, "_raw", &rd->last_collected_value, RRDVAR_FLAG_NONE);
- rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_TIME_T, NULL, "_last_collected_t", &rd->last_collected_time.tv_sec, RRDVAR_FLAG_NONE);
-
RRDCALCTEMPLATE *rt;
foreach_rrdcalctemplate_read(host, rt) {
if(!rt->foreach_dimension_pattern)
continue;
- if(rrdcalctemplate_check_rrdset_conditions(rt, st, host))
+ if(rrdcalctemplate_check_rrdset_conditions(rt, st, host)) {
rrdcalctemplate_check_rrddim_conditions_and_link(rt, st, rd, host);
+ }
}
foreach_rrdcalctemplate_done(rt);
+
+ if (health_variable_check(health_rrdvars, st, rd))
+ rrdvar_store_for_chart(host, st);
}
rrddim_foreach_done(rd);
}
@@ -1002,9 +980,7 @@ void *health_main(void *ptr) {
rrdcalc_delete_alerts_not_matching_host_labels_from_all_hosts();
unsigned int loop = 0;
-#ifdef ENABLE_ACLK
- unsigned int marked_aclk_reload_loop = 0;
-#endif
+
while(service_running(SERVICE_HEALTH)) {
loop++;
debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
@@ -1033,15 +1009,8 @@ void *health_main(void *ptr) {
}
}
-#ifdef ENABLE_ACLK
- if (aclk_alert_reloaded && !marked_aclk_reload_loop)
- marked_aclk_reload_loop = loop;
-#endif
-
worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK);
- rrd_rdlock();
-
- rrdhost_foreach_read(host) {
+ dfe_start_reentrant(rrdhost_root_index, host) {
if(unlikely(!service_running(SERVICE_HEALTH)))
break;
@@ -1049,11 +1018,8 @@ void *health_main(void *ptr) {
if (unlikely(!host->health.health_enabled))
continue;
- if (unlikely(!rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))) {
- rrd_unlock();
- initialize_health(host, host == localhost);
- rrd_rdlock();
- }
+ if (unlikely(!rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)))
+ initialize_health(host);
health_execute_delayed_initializations(host);
@@ -1147,7 +1113,7 @@ void *health_main(void *ptr) {
rc->value = NAN;
#ifdef ENABLE_ACLK
- if (netdata_cloud_setting && likely(!aclk_alert_reloaded))
+ if (netdata_cloud_setting)
sql_queue_alarm_to_aclk(host, ae, 1);
#endif
}
@@ -1518,9 +1484,28 @@ void *health_main(void *ptr) {
}
break;
}
- } //for each host
+#ifdef ENABLE_ACLK
+ if (netdata_cloud_setting) {
+ struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
+ if (unlikely(!wc)) {
+ continue;
+ }
+
+ if (wc->alert_queue_removed == 1) {
+ sql_queue_removed_alerts_to_aclk(host);
+ } else if (wc->alert_queue_removed > 1) {
+ wc->alert_queue_removed--;
+ }
- rrd_unlock();
+ if (wc->alert_checkpoint_req == 1) {
+ aclk_push_alarm_checkpoint(host);
+ } else if (wc->alert_checkpoint_req > 1) {
+ wc->alert_checkpoint_req--;
+ }
+ }
+#endif
+ }
+ dfe_done(host);
// wait for all notifications to finish before allowing health to be cleaned up
ALARM_ENTRY *ae;
@@ -1531,22 +1516,6 @@ void *health_main(void *ptr) {
health_alarm_wait_for_execution(ae);
}
-#ifdef ENABLE_ACLK
- if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
- rrdhost_foreach_read(host) {
- if(unlikely(!service_running(SERVICE_HEALTH)))
- break;
-
- if (unlikely(!host->health.health_enabled))
- continue;
-
- sql_queue_removed_alerts_to_aclk(host);
- }
- aclk_alert_reloaded = 0;
- marked_aclk_reload_loop = 0;
- }
-#endif
-
if(unlikely(!service_running(SERVICE_HEALTH)))
break;