summaryrefslogtreecommitdiffstats
path: root/health/health.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--health/health.c132
1 files changed, 104 insertions, 28 deletions
diff --git a/health/health.c b/health/health.c
index 329191fb8..b81361e8a 100644
--- a/health/health.c
+++ b/health/health.c
@@ -2,14 +2,49 @@
#include "health.h"
-struct health_cmdapi_thread_status {
- int status;
- ;
- struct rusage rusage;
-};
-
unsigned int default_health_enabled = 1;
+char *silencers_filename;
+
+// the queue of executed alarm notifications that haven't been waited for yet
+static struct {
+ ALARM_ENTRY *head; // oldest
+ ALARM_ENTRY *tail; // latest
+} alarm_notifications_in_progress = {NULL, NULL};
+
+static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae)
+{
+ ae->prev_in_progress = NULL;
+ ae->next_in_progress = NULL;
+
+ if (NULL != alarm_notifications_in_progress.tail) {
+ ae->prev_in_progress = alarm_notifications_in_progress.tail;
+ alarm_notifications_in_progress.tail->next_in_progress = ae;
+ }
+ if (NULL == alarm_notifications_in_progress.head) {
+ alarm_notifications_in_progress.head = ae;
+ }
+ alarm_notifications_in_progress.tail = ae;
+
+}
+static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae)
+{
+ struct alarm_entry *prev = ae->prev_in_progress;
+ struct alarm_entry *next = ae->next_in_progress;
+
+ if (NULL != prev) {
+ prev->next_in_progress = next;
+ }
+ if (NULL != next) {
+ next->prev_in_progress = prev;
+ }
+ if (ae == alarm_notifications_in_progress.head) {
+ alarm_notifications_in_progress.head = next;
+ }
+ if (ae == alarm_notifications_in_progress.tail) {
+ alarm_notifications_in_progress.tail = prev;
+ }
+}
// ----------------------------------------------------------------------------
// health initialization
@@ -44,7 +79,7 @@ inline char *health_stock_config_dir(void) {
*
* Function used to initialize the silencer structure.
*/
-void health_silencers_init(void) {
+static void health_silencers_init(void) {
FILE *fd = fopen(silencers_filename, "r");
if (fd) {
fseek(fd, 0 , SEEK_END);
@@ -70,7 +105,7 @@ void health_silencers_init(void) {
}
fclose(fd);
} else {
- error("Cannot open the file %s",silencers_filename);
+ info("Cannot open the file %s, so Netdata will work with the default health configuration.",silencers_filename);
}
}
@@ -100,7 +135,7 @@ void health_init(void) {
*
* @param host the structure of the host that the function will reload the configuration.
*/
-void health_reload_host(RRDHOST *host) {
+static void health_reload_host(RRDHOST *host) {
if(unlikely(!host->health_enabled))
return;
@@ -152,9 +187,14 @@ void health_reload_host(RRDHOST *host) {
rrdhost_wrlock(host);
health_readdir(host, user_path, stock_path, NULL);
+ //Discard alarms with labels that do not apply to host
+ rrdcalc_labels_unlink_alarm_from_host(host);
+
// link the loaded alarms to their charts
RRDDIM *rd;
rrdset_foreach_write(st, host) {
+ if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
+ continue;
rrdsetcalc_link_matching(st);
rrdcalctemplate_link_matching(st);
@@ -175,7 +215,10 @@ void health_reload_host(RRDHOST *host) {
* Reload the host configuration for all hosts.
*/
void health_reload(void) {
-
+#ifdef ENABLE_ACLK
+ if (netdata_cloud_setting)
+ aclk_single_update_disable();
+#endif
rrd_rdlock();
RRDHOST *host;
@@ -183,6 +226,12 @@ void health_reload(void) {
health_reload_host(host);
rrd_unlock();
+#ifdef ENABLE_ACLK
+ if (netdata_cloud_setting) {
+ aclk_single_update_enable();
+ aclk_alarm_reload();
+ }
+#endif
}
// ----------------------------------------------------------------------------
@@ -252,7 +301,6 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
}
static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
- pid_t command_pid;
const char *exec = (ae->exec) ? ae->exec : host->health_default_exec;
const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
@@ -308,25 +356,30 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
);
ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
- ae->exec_run_timestamp = now_realtime_sec();
+ ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */
debug(D_HEALTH, "executing command '%s'", command_to_run);
- FILE *fp = mypopen(command_to_run, &command_pid);
- if(!fp) {
- error("HEALTH: Cannot popen(\"%s\", \"r\").", command_to_run);
- goto done;
- }
- debug(D_HEALTH, "HEALTH reading from command (discarding command's output)");
- char buffer[100 + 1];
- while(fgets(buffer, 100, fp) != NULL) ;
- ae->exec_code = mypclose(fp, command_pid);
+ ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
+ ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
+ enqueue_alarm_notify_in_progress(ae);
+
+ return; //health_alarm_wait_for_execution
+done:
+ health_alarm_log_save(host, ae);
+}
+
+static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
+ if (!(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
+ return;
+
+ spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp);
debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
+ ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
if(ae->exec_code != 0)
ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
-done:
- health_alarm_log_save(host, ae);
+ unlink_alarm_notify_in_progress(ae);
}
static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
@@ -388,6 +441,7 @@ static inline void health_alarm_log_process(RRDHOST *host) {
ALARM_ENTRY *t = ae->next;
if(likely(!alarm_entry_isrepeating(host, ae))) {
+ health_alarm_wait_for_execution(ae);
health_alarm_log_free_one_nochecks_nounlink(ae);
host->health_log.count--;
}
@@ -430,14 +484,21 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
return 0;
}
+ if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) {
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name);
+ return 0;
+ }
+
if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
return 0;
}
int update_every = rc->rrdset->update_every;
- time_t first = rrdset_first_entry_t(rc->rrdset);
- time_t last = rrdset_last_entry_t(rc->rrdset);
+ rrdset_rdlock(rc->rrdset);
+ time_t first = rrdset_first_entry_t_nolock(rc->rrdset);
+ time_t last = rrdset_last_entry_t_nolock(rc->rrdset);
+ rrdset_unlock(rc->rrdset);
if(unlikely(now + update_every < first /* || now - update_every > last */)) {
debug(D_HEALTH
@@ -488,7 +549,7 @@ static void health_main_cleanup(void *ptr) {
static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
}
-SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
+static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
SILENCER *s;
debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s",
rc->name, (rc->rrdset)?rc->rrdset->context:"", rc->chart, host, (rc->rrdset)?rc->rrdset->family:"");
@@ -530,7 +591,7 @@ SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
*
* @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise
*/
-int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
+static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
uint32_t rrdcalc_flags_old = rc->rrdcalc_flags;
// Clear the flags
rc->rrdcalc_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED);
@@ -577,6 +638,8 @@ void *health_main(void *ptr) {
time_t now = now_realtime_sec();
time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
+ rrdcalc_labels_unlink();
+
unsigned int loop = 0;
while(!netdata_exit) {
loop++;
@@ -930,6 +993,7 @@ void *health_main(void *ptr) {
rc->rrdcalc_flags |= RRDCALC_FLAG_RUN_ONCE;
health_process_notifications(host, ae);
debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
+ health_alarm_wait_for_execution(ae);
health_alarm_log_free_one_nochecks_nounlink(ae);
}
}
@@ -944,11 +1008,23 @@ void *health_main(void *ptr) {
// and cleanup
health_alarm_log_process(host);
- if (unlikely(netdata_exit))
+ if (unlikely(netdata_exit)) {
+ // wait for all notifications to finish before allowing health to be cleaned up
+ ALARM_ENTRY *ae;
+ while (NULL != (ae = alarm_notifications_in_progress.head)) {
+ health_alarm_wait_for_execution(ae);
+ }
break;
+ }
} /* rrdhost_foreach */
+ // wait for all notifications to finish before allowing health to be cleaned up
+ ALARM_ENTRY *ae;
+ while (NULL != (ae = alarm_notifications_in_progress.head)) {
+ health_alarm_wait_for_execution(ae);
+ }
+
rrd_unlock();