summaryrefslogtreecommitdiffstats
path: root/health/health.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--health/health.c126
1 files changed, 82 insertions, 44 deletions
diff --git a/health/health.c b/health/health.c
index 528238d74..3c1e5693e 100644
--- a/health/health.c
+++ b/health/health.c
@@ -58,7 +58,7 @@ static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae)
inline char *health_user_config_dir(void) {
char buffer[FILENAME_MAX + 1];
snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir);
- return config_get(CONFIG_SECTION_HEALTH, "health configuration directory", buffer);
+ return config_get(CONFIG_SECTION_DIRECTORIES, "health config", buffer);
}
/**
@@ -71,7 +71,7 @@ inline char *health_user_config_dir(void) {
inline char *health_stock_config_dir(void) {
char buffer[FILENAME_MAX + 1];
snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir);
- return config_get(CONFIG_SECTION_HEALTH, "stock health configuration directory", buffer);
+ return config_get(CONFIG_SECTION_DIRECTORIES, "stock health config", buffer);
}
/**
@@ -354,7 +354,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN");
- snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s'",
+ snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
exec,
recipient,
host->registry_hostname,
@@ -383,7 +383,8 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
buffer_tostring(warn_alarms),
buffer_tostring(crit_alarms),
ae->classification?ae->classification:"Unknown",
- edit_command
+ edit_command,
+ host != localhost ? host->machine_guid:""
);
ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
@@ -453,9 +454,11 @@ static inline void health_alarm_log_process(RRDHOST *host) {
// remember this for the next iteration
host->health_last_processed_id = first_waiting;
+ bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max;
+
netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
- if(host->health_log.count <= host->health_log.max)
+ if (!cleanup_excess_log_entries)
return;
// cleanup excess entries in the log
@@ -514,11 +517,6 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
return 0;
}
- if(unlikely(!rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ENABLED))) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart is not enabled", rc->chart?rc->chart:"NOCHART", rc->name);
- return 0;
- }
-
if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) {
debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name);
return 0;
@@ -576,6 +574,8 @@ static inline int check_if_resumed_from_suspension(void) {
}
static void health_main_cleanup(void *ptr) {
+ worker_unregister();
+
struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
@@ -658,35 +658,34 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
// Create alarms for dimensions that have been added to charts
// since the previous iteration.
static void init_pending_foreach_alarms(RRDHOST *host) {
- rrdhost_wrlock(host);
+ RRDSET *st;
+ RRDDIM *rd;
- if (host->alarms_with_foreach || host->alarms_template_with_foreach) {
- if (rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS)) {
- RRDSET *st;
+ if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS))
+ return;
- rrdset_foreach_read(st, host) {
- rrdset_wrlock(st);
+ rrdhost_wrlock(host);
- if (rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS)) {
- RRDDIM *rd;
+ rrdset_foreach_write(st, host) {
+ if (!rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS))
+ continue;
- rrddim_foreach_write(rd, st) {
- if (rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM)) {
- rrdcalc_link_to_rrddim(rd, st, host);
- rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM);
- }
- }
+ rrdset_rdlock(st);
- rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS);
- }
+ rrddim_foreach_read(rd, st) {
+ if (!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM))
+ continue;
- rrdset_unlock(st);
- }
+ rrdcalc_link_to_rrddim(rd, st, host);
- rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS);
+ rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM);
}
+
+ rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS);
+ rrdset_unlock(st);
}
+ rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS);
rrdhost_unlock(host);
}
@@ -699,7 +698,31 @@ static void init_pending_foreach_alarms(RRDHOST *host) {
*
* @return It always returns NULL
*/
+
+#define WORKER_HEALTH_JOB_RRD_LOCK 0
+#define WORKER_HEALTH_JOB_HOST_LOCK 1
+#define WORKER_HEALTH_JOB_DB_QUERY 2
+#define WORKER_HEALTH_JOB_CALC_EVAL 3
+#define WORKER_HEALTH_JOB_WARNING_EVAL 4
+#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
+#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
+#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
+
+#if WORKER_UTILIZATION_MAX_JOB_TYPES < 8
+#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8
+#endif
+
void *health_main(void *ptr) {
+ worker_register("HEALTH");
+ worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock");
+ worker_register_job_name(WORKER_HEALTH_JOB_HOST_LOCK, "host lock");
+ worker_register_job_name(WORKER_HEALTH_JOB_DB_QUERY, "db lookup");
+ worker_register_job_name(WORKER_HEALTH_JOB_CALC_EVAL, "calc eval");
+ worker_register_job_name(WORKER_HEALTH_JOB_WARNING_EVAL, "warning eval");
+ worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval");
+ worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry");
+ worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process");
+
netdata_thread_cleanup_push(health_main_cleanup, ptr);
int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
@@ -747,6 +770,7 @@ void *health_main(void *ptr) {
marked_aclk_reload_loop = loop;
#endif
+ worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK);
rrd_rdlock();
RRDHOST *host;
@@ -776,6 +800,7 @@ void *health_main(void *ptr) {
init_pending_foreach_alarms(host);
+ worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
rrdhost_rdlock(host);
// the first loop is to lookup values from the db
@@ -790,6 +815,7 @@ void *health_main(void *ptr) {
rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
if (!rrdcalc_isrepeating(rc)) {
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
time_t now = now_realtime_sec();
ALARM_ENTRY *ae = health_create_alarm_entry(
host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
@@ -804,11 +830,10 @@ void *health_main(void *ptr) {
rc->value = NAN;
#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
if (netdata_cloud_setting && likely(!aclk_alert_reloaded))
- sql_queue_removed_alerts_to_aclk(host);
+ sql_queue_alarm_to_aclk(host, ae, 1);
#endif
}
}
- continue;
}
if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
@@ -825,6 +850,8 @@ void *health_main(void *ptr) {
// if there is database lookup, do it
if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
+ worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY);
+
/* time_t old_db_timestamp = rc->db_before; */
int value_is_null = 0;
@@ -881,6 +908,8 @@ void *health_main(void *ptr) {
// if there is calculation expression, run it
if (unlikely(rc->calculation)) {
+ worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
+
if (unlikely(!expression_evaluate(rc->calculation))) {
// calculation failed
rc->value = NAN;
@@ -929,6 +958,8 @@ void *health_main(void *ptr) {
// check the warning expression
if (likely(rc->warning)) {
+ worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
+
if (unlikely(!expression_evaluate(rc->warning))) {
// calculation failed
rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
@@ -953,6 +984,8 @@ void *health_main(void *ptr) {
// check the critical expression
if (likely(rc->critical)) {
+ worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
+
if (unlikely(!expression_evaluate(rc->critical))) {
// calculation failed
rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
@@ -1010,6 +1043,7 @@ void *health_main(void *ptr) {
// check if the new status and the old differ
if (status != rc->status) {
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
int delay = 0;
// apply trigger hysteresis
@@ -1041,19 +1075,19 @@ void *health_main(void *ptr) {
rc->delay_last = delay;
rc->delay_up_to_timestamp = now + delay;
- if(likely(!rrdcalc_isrepeating(rc))) {
- ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
- rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
- rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
- rc->delay_last,
- (
- ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
- ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
- )
- );
- health_alarm_log(host, ae);
- }
+
+ ALARM_ENTRY *ae = health_create_alarm_entry(
+ host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
+ rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
+ rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
+ rc->delay_last,
+ (
+ ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
+ ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
+ )
+ );
+ health_alarm_log(host, ae);
+
rc->last_status_change = now;
rc->old_status = rc->status;
rc->status = status;
@@ -1091,7 +1125,9 @@ void *health_main(void *ptr) {
}
if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
rc->last_repeat = now;
+ if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
ALARM_ENTRY *ae = health_create_alarm_entry(
host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
@@ -1122,6 +1158,7 @@ void *health_main(void *ptr) {
// execute notifications
// and cleanup
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS);
health_alarm_log_process(host);
if (unlikely(netdata_exit)) {
@@ -1160,6 +1197,7 @@ void *health_main(void *ptr) {
now = now_realtime_sec();
if(now < next_run) {
+ worker_is_idle();
debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
now = now_realtime_sec();