diff options
Diffstat (limited to '')
-rw-r--r-- | health/health.c | 126 |
1 files changed, 82 insertions, 44 deletions
diff --git a/health/health.c b/health/health.c index 528238d74..3c1e5693e 100644 --- a/health/health.c +++ b/health/health.c @@ -58,7 +58,7 @@ static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae) inline char *health_user_config_dir(void) { char buffer[FILENAME_MAX + 1]; snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir); - return config_get(CONFIG_SECTION_HEALTH, "health configuration directory", buffer); + return config_get(CONFIG_SECTION_DIRECTORIES, "health config", buffer); } /** @@ -71,7 +71,7 @@ inline char *health_user_config_dir(void) { inline char *health_stock_config_dir(void) { char buffer[FILENAME_MAX + 1]; snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir); - return config_get(CONFIG_SECTION_HEALTH, "stock health configuration directory", buffer); + return config_get(CONFIG_SECTION_DIRECTORIES, "stock health config", buffer); } /** @@ -354,7 +354,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN"); - snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s'", + snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'", exec, recipient, host->registry_hostname, @@ -383,7 +383,8 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { buffer_tostring(warn_alarms), buffer_tostring(crit_alarms), ae->classification?ae->classification:"Unknown", - edit_command + edit_command, + host != localhost ? host->machine_guid:"" ); ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN; @@ -453,9 +454,11 @@ static inline void health_alarm_log_process(RRDHOST *host) { // remember this for the next iteration host->health_last_processed_id = first_waiting; + bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max; + netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); - if(host->health_log.count <= host->health_log.max) + if (!cleanup_excess_log_entries) return; // cleanup excess entries in the log @@ -514,11 +517,6 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) return 0; } - if(unlikely(!rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ENABLED))) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart is not enabled", rc->chart?rc->chart:"NOCHART", rc->name); - return 0; - } - if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) { debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name); return 0; @@ -576,6 +574,8 @@ static inline int check_if_resumed_from_suspension(void) { } static void health_main_cleanup(void *ptr) { + worker_unregister(); + struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; @@ -658,35 +658,34 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { // Create alarms for dimensions that have been added to charts // since the previous iteration. static void init_pending_foreach_alarms(RRDHOST *host) { - rrdhost_wrlock(host); + RRDSET *st; + RRDDIM *rd; - if (host->alarms_with_foreach || host->alarms_template_with_foreach) { - if (rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS)) { - RRDSET *st; + if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS)) + return; - rrdset_foreach_read(st, host) { - rrdset_wrlock(st); + rrdhost_wrlock(host); - if (rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS)) { - RRDDIM *rd; + rrdset_foreach_write(st, host) { + if (!rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS)) + continue; - rrddim_foreach_write(rd, st) { - if (rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM)) { - rrdcalc_link_to_rrddim(rd, st, host); - rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM); - } - } + rrdset_rdlock(st); - rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS); - } + rrddim_foreach_read(rd, st) { + if (!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM)) + continue; - rrdset_unlock(st); - } + rrdcalc_link_to_rrddim(rd, st, host); - rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS); + rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM); } + + rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS); + rrdset_unlock(st); } + rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS); rrdhost_unlock(host); } @@ -699,7 +698,31 @@ static void init_pending_foreach_alarms(RRDHOST *host) { * * @return It always returns NULL */ + +#define WORKER_HEALTH_JOB_RRD_LOCK 0 +#define WORKER_HEALTH_JOB_HOST_LOCK 1 +#define WORKER_HEALTH_JOB_DB_QUERY 2 +#define WORKER_HEALTH_JOB_CALC_EVAL 3 +#define WORKER_HEALTH_JOB_WARNING_EVAL 4 +#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5 +#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6 +#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 8 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8 +#endif + void *health_main(void *ptr) { + worker_register("HEALTH"); + worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock"); + worker_register_job_name(WORKER_HEALTH_JOB_HOST_LOCK, "host lock"); + worker_register_job_name(WORKER_HEALTH_JOB_DB_QUERY, "db lookup"); + worker_register_job_name(WORKER_HEALTH_JOB_CALC_EVAL, "calc eval"); + worker_register_job_name(WORKER_HEALTH_JOB_WARNING_EVAL, "warning eval"); + worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval"); + worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry"); + worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process"); + netdata_thread_cleanup_push(health_main_cleanup, ptr); int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10); @@ -747,6 +770,7 @@ void *health_main(void *ptr) { marked_aclk_reload_loop = loop; #endif + worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK); rrd_rdlock(); RRDHOST *host; @@ -776,6 +800,7 @@ void *health_main(void *ptr) { init_pending_foreach_alarms(host); + worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK); rrdhost_rdlock(host); // the first loop is to lookup values from the db @@ -790,6 +815,7 @@ void *health_main(void *ptr) { rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) && now > (rc->rrdset->last_collected_time.tv_sec + 60))) { if (!rrdcalc_isrepeating(rc)) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); time_t now = now_realtime_sec(); ALARM_ENTRY *ae = health_create_alarm_entry( host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, @@ -804,11 +830,10 @@ void *health_main(void *ptr) { rc->value = NAN; #if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) if (netdata_cloud_setting && likely(!aclk_alert_reloaded)) - sql_queue_removed_alerts_to_aclk(host); + sql_queue_alarm_to_aclk(host, ae, 1); #endif } } - continue; } if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) { @@ -825,6 +850,8 @@ void *health_main(void *ptr) { // if there is database lookup, do it if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) { + worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY); + /* time_t old_db_timestamp = rc->db_before; */ int value_is_null = 0; @@ -881,6 +908,8 @@ void *health_main(void *ptr) { // if there is calculation expression, run it if (unlikely(rc->calculation)) { + worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL); + if (unlikely(!expression_evaluate(rc->calculation))) { // calculation failed rc->value = NAN; @@ -929,6 +958,8 @@ void *health_main(void *ptr) { // check the warning expression if (likely(rc->warning)) { + worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL); + if (unlikely(!expression_evaluate(rc->warning))) { // calculation failed rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR; @@ -953,6 +984,8 @@ void *health_main(void *ptr) { // check the critical expression if (likely(rc->critical)) { + worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL); + if (unlikely(!expression_evaluate(rc->critical))) { // calculation failed rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR; @@ -1010,6 +1043,7 @@ void *health_main(void *ptr) { // check if the new status and the old differ if (status != rc->status) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); int delay = 0; // apply trigger hysteresis @@ -1041,19 +1075,19 @@ void *health_main(void *ptr) { rc->delay_last = delay; rc->delay_up_to_timestamp = now + delay; - if(likely(!rrdcalc_isrepeating(rc))) { - ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, - rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, - rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, - rc->delay_last, - ( - ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | - ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) - ) - ); - health_alarm_log(host, ae); - } + + ALARM_ENTRY *ae = health_create_alarm_entry( + host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, + rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, + rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, + rc->delay_last, + ( + ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) + ) + ); + health_alarm_log(host, ae); + rc->last_status_change = now; rc->old_status = rc->status; rc->status = status; @@ -1091,7 +1125,9 @@ void *health_main(void *ptr) { } if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); rc->last_repeat = now; + if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++; ALARM_ENTRY *ae = health_create_alarm_entry( host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, @@ -1122,6 +1158,7 @@ void *health_main(void *ptr) { // execute notifications // and cleanup + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS); health_alarm_log_process(host); if (unlikely(netdata_exit)) { @@ -1160,6 +1197,7 @@ void *health_main(void *ptr) { now = now_realtime_sec(); if(now < next_run) { + worker_is_idle(); debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now)); sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now)); now = now_realtime_sec(); |