diff options
Diffstat (limited to 'health/health.c')
-rw-r--r-- | health/health.c | 132 |
1 files changed, 104 insertions, 28 deletions
diff --git a/health/health.c b/health/health.c index 329191fb..b81361e8 100644 --- a/health/health.c +++ b/health/health.c @@ -2,14 +2,49 @@ #include "health.h" -struct health_cmdapi_thread_status { - int status; - ; - struct rusage rusage; -}; - unsigned int default_health_enabled = 1; +char *silencers_filename; + +// the queue of executed alarm notifications that haven't been waited for yet +static struct { + ALARM_ENTRY *head; // oldest + ALARM_ENTRY *tail; // latest +} alarm_notifications_in_progress = {NULL, NULL}; + +static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae) +{ + ae->prev_in_progress = NULL; + ae->next_in_progress = NULL; + + if (NULL != alarm_notifications_in_progress.tail) { + ae->prev_in_progress = alarm_notifications_in_progress.tail; + alarm_notifications_in_progress.tail->next_in_progress = ae; + } + if (NULL == alarm_notifications_in_progress.head) { + alarm_notifications_in_progress.head = ae; + } + alarm_notifications_in_progress.tail = ae; + +} +static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae) +{ + struct alarm_entry *prev = ae->prev_in_progress; + struct alarm_entry *next = ae->next_in_progress; + + if (NULL != prev) { + prev->next_in_progress = next; + } + if (NULL != next) { + next->prev_in_progress = prev; + } + if (ae == alarm_notifications_in_progress.head) { + alarm_notifications_in_progress.head = next; + } + if (ae == alarm_notifications_in_progress.tail) { + alarm_notifications_in_progress.tail = prev; + } +} // ---------------------------------------------------------------------------- // health initialization @@ -44,7 +79,7 @@ inline char *health_stock_config_dir(void) { * * Function used to initialize the silencer structure. */ -void health_silencers_init(void) { +static void health_silencers_init(void) { FILE *fd = fopen(silencers_filename, "r"); if (fd) { fseek(fd, 0 , SEEK_END); @@ -70,7 +105,7 @@ void health_silencers_init(void) { } fclose(fd); } else { - error("Cannot open the file %s",silencers_filename); + info("Cannot open the file %s, so Netdata will work with the default health configuration.",silencers_filename); } } @@ -100,7 +135,7 @@ void health_init(void) { * * @param host the structure of the host that the function will reload the configuration. */ -void health_reload_host(RRDHOST *host) { +static void health_reload_host(RRDHOST *host) { if(unlikely(!host->health_enabled)) return; @@ -152,9 +187,14 @@ void health_reload_host(RRDHOST *host) { rrdhost_wrlock(host); health_readdir(host, user_path, stock_path, NULL); + //Discard alarms with labels that do not apply to host + rrdcalc_labels_unlink_alarm_from_host(host); + // link the loaded alarms to their charts RRDDIM *rd; rrdset_foreach_write(st, host) { + if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED)) + continue; rrdsetcalc_link_matching(st); rrdcalctemplate_link_matching(st); @@ -175,7 +215,10 @@ void health_reload_host(RRDHOST *host) { * Reload the host configuration for all hosts. */ void health_reload(void) { - +#ifdef ENABLE_ACLK + if (netdata_cloud_setting) + aclk_single_update_disable(); +#endif rrd_rdlock(); RRDHOST *host; @@ -183,6 +226,12 @@ void health_reload(void) { health_reload_host(host); rrd_unlock(); +#ifdef ENABLE_ACLK + if (netdata_cloud_setting) { + aclk_single_update_enable(); + aclk_alarm_reload(); + } +#endif } // ---------------------------------------------------------------------------- @@ -252,7 +301,6 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { } static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1]; - pid_t command_pid; const char *exec = (ae->exec) ? ae->exec : host->health_default_exec; const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient; @@ -308,25 +356,30 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { ); ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN; - ae->exec_run_timestamp = now_realtime_sec(); + ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */ debug(D_HEALTH, "executing command '%s'", command_to_run); - FILE *fp = mypopen(command_to_run, &command_pid); - if(!fp) { - error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run); - goto done; - } - debug(D_HEALTH, "HEALTH reading from command (discarding command's output)"); - char buffer[100 + 1]; - while(fgets(buffer, 100, fp) != NULL) ; - ae->exec_code = mypclose(fp, command_pid); + ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS; + ae->exec_spawn_serial = spawn_enq_cmd(command_to_run); + enqueue_alarm_notify_in_progress(ae); + + return; //health_alarm_wait_for_execution +done: + health_alarm_log_save(host, ae); +} + +static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) { + if (!(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS)) + return; + + spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp); debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code); + ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS; if(ae->exec_code != 0) ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED; -done: - health_alarm_log_save(host, ae); + unlink_alarm_notify_in_progress(ae); } static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) { @@ -388,6 +441,7 @@ static inline void health_alarm_log_process(RRDHOST *host) { ALARM_ENTRY *t = ae->next; if(likely(!alarm_entry_isrepeating(host, ae))) { + health_alarm_wait_for_execution(ae); health_alarm_log_free_one_nochecks_nounlink(ae); host->health_log.count--; } @@ -430,14 +484,21 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) return 0; } + if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) { + debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name); + return 0; + } + if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) { debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name); return 0; } int update_every = rc->rrdset->update_every; - time_t first = rrdset_first_entry_t(rc->rrdset); - time_t last = rrdset_last_entry_t(rc->rrdset); + rrdset_rdlock(rc->rrdset); + time_t first = rrdset_first_entry_t_nolock(rc->rrdset); + time_t last = rrdset_last_entry_t_nolock(rc->rrdset); + rrdset_unlock(rc->rrdset); if(unlikely(now + update_every < first /* || now - update_every > last */)) { debug(D_HEALTH @@ -488,7 +549,7 @@ static void health_main_cleanup(void *ptr) { static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; } -SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { +static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { SILENCER *s; debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s", rc->name, (rc->rrdset)?rc->rrdset->context:"", rc->chart, host, (rc->rrdset)?rc->rrdset->family:""); @@ -530,7 +591,7 @@ SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { * * @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise */ -int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { +static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { uint32_t rrdcalc_flags_old = rc->rrdcalc_flags; // Clear the flags rc->rrdcalc_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED); @@ -577,6 +638,8 @@ void *health_main(void *ptr) { time_t now = now_realtime_sec(); time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60); + rrdcalc_labels_unlink(); + unsigned int loop = 0; while(!netdata_exit) { loop++; @@ -930,6 +993,7 @@ void *health_main(void *ptr) { rc->rrdcalc_flags |= RRDCALC_FLAG_RUN_ONCE; health_process_notifications(host, ae); debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id); + health_alarm_wait_for_execution(ae); health_alarm_log_free_one_nochecks_nounlink(ae); } } @@ -944,11 +1008,23 @@ void *health_main(void *ptr) { // and cleanup health_alarm_log_process(host); - if (unlikely(netdata_exit)) + if (unlikely(netdata_exit)) { + // wait for all notifications to finish before allowing health to be cleaned up + ALARM_ENTRY *ae; + while (NULL != (ae = alarm_notifications_in_progress.head)) { + health_alarm_wait_for_execution(ae); + } break; + } } /* rrdhost_foreach */ + // wait for all notifications to finish before allowing health to be cleaned up + ALARM_ENTRY *ae; + while (NULL != (ae = alarm_notifications_in_progress.head)) { + health_alarm_wait_for_execution(ae); + } + rrd_unlock(); |