diff options
Diffstat (limited to 'health/health.c')
-rw-r--r-- | health/health.c | 269 |
1 files changed, 165 insertions, 104 deletions
diff --git a/health/health.c b/health/health.c index e04debb9..eeed3a67 100644 --- a/health/health.c +++ b/health/health.c @@ -22,6 +22,35 @@ char *silencers_filename; SIMPLE_PATTERN *conf_enabled_alarms = NULL; DICTIONARY *health_rrdvars; +void health_entry_flags_to_json_array(BUFFER *wb, const char *key, HEALTH_ENTRY_FLAGS flags) { + buffer_json_member_add_array(wb, key); + + if(flags & HEALTH_ENTRY_FLAG_PROCESSED) + buffer_json_add_array_item_string(wb, "PROCESSED"); + if(flags & HEALTH_ENTRY_FLAG_UPDATED) + buffer_json_add_array_item_string(wb, "UPDATED"); + if(flags & HEALTH_ENTRY_FLAG_EXEC_RUN) + buffer_json_add_array_item_string(wb, "EXEC_RUN"); + if(flags & HEALTH_ENTRY_FLAG_EXEC_FAILED) + buffer_json_add_array_item_string(wb, "EXEC_FAILED"); + if(flags & HEALTH_ENTRY_FLAG_SILENCED) + buffer_json_add_array_item_string(wb, "SILENCED"); + if(flags & HEALTH_ENTRY_RUN_ONCE) + buffer_json_add_array_item_string(wb, "RUN_ONCE"); + if(flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS) + buffer_json_add_array_item_string(wb, "EXEC_IN_PROGRESS"); + if(flags & HEALTH_ENTRY_FLAG_IS_REPEATING) + buffer_json_add_array_item_string(wb, "RECURRING"); + if(flags & HEALTH_ENTRY_FLAG_SAVED) + buffer_json_add_array_item_string(wb, "SAVED"); + if(flags & HEALTH_ENTRY_FLAG_ACLK_QUEUED) + buffer_json_add_array_item_string(wb, "ACLK_QUEUED"); + if(flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION) + buffer_json_add_array_item_string(wb, "NO_CLEAR_NOTIFICATION"); + + buffer_json_array_close(wb); +} + static bool prepare_command(BUFFER *wb, const char *exec, const char *recipient, @@ -52,8 +81,9 @@ static bool prepare_command(BUFFER *wb, const char *crit_alarms, const char *classification, const char *edit_command, - const char *machine_guid) -{ + const char *machine_guid, + uuid_t *transition_id +) { char buf[8192]; size_t n = 8192 - 1; @@ -159,6 +189,12 @@ static bool prepare_command(BUFFER *wb, return false; buffer_sprintf(wb, " '%s'", buf); + char tr_id[UUID_STR_LEN]; + uuid_unparse_lower(*transition_id, tr_id); + if (!sanitize_command_argument_string(buf, tr_id, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + return true; } @@ -257,22 +293,22 @@ static void health_silencers_init(void) { if (copied == (length* sizeof(char))) { str[length] = 0x00; json_parse(str, NULL, health_silencers_json_read_callback); - info("Parsed health silencers file %s", silencers_filename); + netdata_log_info("Parsed health silencers file %s", silencers_filename); } else { - error("Cannot read the data from health silencers file %s", silencers_filename); + netdata_log_error("Cannot read the data from health silencers file %s", silencers_filename); } freez(str); } } else { - error( - "Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.", - silencers_filename, - (int64_t)length, - HEALTH_SILENCERS_MAX_FILE_LEN); + netdata_log_error("Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.", + silencers_filename, + (int64_t)length, + HEALTH_SILENCERS_MAX_FILE_LEN); } fclose(fd); } else { - info("Cannot open the file %s, so Netdata will work with the default health configuration.",silencers_filename); + netdata_log_info("Cannot open the file %s, so Netdata will work with the default health configuration.", + silencers_filename); } } @@ -282,10 +318,10 @@ static void health_silencers_init(void) { * Initialize the health thread. */ void health_init(void) { - debug(D_HEALTH, "Health configuration initializing"); + netdata_log_debug(D_HEALTH, "Health configuration initializing"); if(!(default_health_enabled = (unsigned int)config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", default_health_enabled))) { - debug(D_HEALTH, "Health is disabled."); + netdata_log_debug(D_HEALTH, "Health is disabled."); return; } @@ -306,7 +342,7 @@ static void health_reload_host(RRDHOST *host) { if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) return; - log_health("[%s]: Reloading health.", rrdhost_hostname(host)); + netdata_log_health("[%s]: Reloading health.", rrdhost_hostname(host)); char *user_path = health_user_config_dir(); char *stock_path = health_stock_config_dir(); @@ -316,13 +352,13 @@ static void health_reload_host(RRDHOST *host) { rrdcalctemplate_delete_all(host); // invalidate all previous entries in the alarm log - netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_read_lock(&host->health_log.spinlock); ALARM_ENTRY *t; for(t = host->health_log.alarms ; t ; t = t->next) { if(t->new_status != RRDCALC_STATUS_REMOVED) t->flags |= HEALTH_ENTRY_FLAG_UPDATED; } - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_read_unlock(&host->health_log.spinlock); // reset all thresholds to all charts RRDSET *st; @@ -349,7 +385,7 @@ static void health_reload_host(RRDHOST *host) { rrdset_foreach_done(st); #ifdef ENABLE_ACLK - if (netdata_cloud_setting) { + if (netdata_cloud_enabled) { struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; if (likely(wc)) { wc->alert_queue_removed = SEND_REMOVED_AFTER_HEALTH_LOOPS; @@ -397,14 +433,15 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) { // do not send notifications for internal statuses - debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + netdata_log_debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); goto done; } if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) { // do not send notifications for disabled statuses - debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); - log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + netdata_log_debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + netdata_log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + // mark it as run, so that we will send the same alarm if it happens again goto done; } @@ -418,11 +455,11 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { if (likely(ret == 1)) { // we have executed this alarm notification in the past - if(last_executed_status == ae->new_status) { + if(last_executed_status == ae->new_status && !(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) { // don't send the notification for the same status again - debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae) + netdata_log_debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae) , rrdcalc_status2string(ae->new_status)); - log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae) + netdata_log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae) , rrdcalc_status2string(ae->new_status)); goto done; } @@ -432,7 +469,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { // so, don't send CLEAR notifications if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) { if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) { - debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s" + netdata_log_debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s" , ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); goto done; } @@ -442,11 +479,11 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { // Check if alarm notifications are silenced if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) { - log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + netdata_log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); goto done; } - log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + netdata_log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health.health_default_exec); const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health.health_default_recipient); @@ -533,7 +570,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { ae->old_value, ae->source?ae_source(ae):"UNKNOWN", (uint32_t)ae->duration, - (uint32_t)ae->non_clear_duration, + (ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING && ae->new_status >= RRDCALC_STATUS_WARNING) ? (uint32_t)ae->duration : (uint32_t)ae->non_clear_duration, ae_units(ae), ae_info(ae), ae_new_value_string(ae), @@ -546,20 +583,21 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { buffer_tostring(crit_alarms), ae->classification?ae_classification(ae):"Unknown", edit_command, - host != localhost ? host->machine_guid:""); + host->machine_guid, + &ae->transition_id); const char *command_to_run = buffer_tostring(wb); if (ok) { ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN; ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */ - debug(D_HEALTH, "executing command '%s'", command_to_run); + netdata_log_debug(D_HEALTH, "executing command '%s'", command_to_run); ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS; ae->exec_spawn_serial = spawn_enq_cmd(command_to_run); enqueue_alarm_notify_in_progress(ae); health_alarm_log_save(host, ae); } else { - error("Failed to format command arguments"); + netdata_log_error("Failed to format command arguments"); } buffer_free(wb); @@ -578,7 +616,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) { return; spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp); - debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code); + netdata_log_debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code); ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS; if(ae->exec_code != 0) @@ -588,7 +626,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) { } static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) { - debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s", + netdata_log_debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s", ae->chart?ae_chart_name(ae):"NOCHART", ae_name(ae), ae->new_value, rrdcalc_status2string(ae->old_status), @@ -602,31 +640,29 @@ static inline void health_alarm_log_process(RRDHOST *host) { uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0; time_t now = now_realtime_sec(); - netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_read_lock(&host->health_log.spinlock); ALARM_ENTRY *ae; for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) { - if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) { - if(unlikely( + if(unlikely( !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) && !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED) - )) { - if(unlikely(ae->unique_id < first_waiting)) - first_waiting = ae->unique_id; + )) { + if(unlikely(ae->unique_id < first_waiting)) + first_waiting = ae->unique_id; - if(likely(now >= ae->delay_up_to_timestamp)) - health_process_notifications(host, ae); - } + if(likely(now >= ae->delay_up_to_timestamp)) + health_process_notifications(host, ae); } } - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_read_unlock(&host->health_log.spinlock); // remember this for the next iteration host->health_last_processed_id = first_waiting; //delete those that are updated, no in progress execution, and is not repeating - netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_write_lock(&host->health_log.spinlock); ALARM_ENTRY *prev = NULL, *next = NULL; for(ae = host->health_log.alarms; ae ; ae = next) { @@ -639,7 +675,7 @@ static inline void health_alarm_log_process(RRDHOST *host) { || ((ae->new_status == RRDCALC_STATUS_REMOVED) && (ae->flags & HEALTH_ENTRY_FLAG_SAVED) && - (ae->when + 3600 < now_realtime_sec()))) + (ae->when + 86400 < now_realtime_sec()))) { if(host->health_log.alarms == ae) { @@ -658,12 +694,12 @@ static inline void health_alarm_log_process(RRDHOST *host) { prev = ae; } - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_write_unlock(&host->health_log.spinlock); } static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) { if(unlikely(!rc->rrdset)) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } @@ -674,27 +710,27 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) *next_run = rc->next_update; } - debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now)); + netdata_log_debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now)); return 0; } if(unlikely(!rc->update_every)) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } @@ -703,7 +739,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) time_t last = rrdset_last_entry_s(rc->rrdset); if(unlikely(now + update_every < first /* || now - update_every > last */)) { - debug(D_HEALTH + netdata_log_debug(D_HEALTH , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)." , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) now, (unsigned long) first , (unsigned long) last); @@ -714,7 +750,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) time_t needed = now + rc->before + rc->after; if(needed + update_every < first || needed - update_every > last) { - debug(D_HEALTH + netdata_log_debug(D_HEALTH , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)." , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) needed, (unsigned long) first , (unsigned long) last); @@ -747,10 +783,10 @@ static void health_main_cleanup(void *ptr) { struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; - info("cleaning up..."); + netdata_log_info("cleaning up..."); static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; - log_health("Health thread ended."); + netdata_log_health("Health thread ended."); } static void initialize_health(RRDHOST *host) @@ -762,7 +798,7 @@ static void initialize_health(RRDHOST *host) rrdhost_flag_set(host, RRDHOST_FLAG_INITIALIZED_HEALTH); - log_health("[%s]: Initializing health.", rrdhost_hostname(host)); + netdata_log_health("[%s]: Initializing health.", rrdhost_hostname(host)); host->health.health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never"); host->health.health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never"); @@ -775,16 +811,32 @@ static void initialize_health(RRDHOST *host) long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max); if(n < 10) { - error("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max); + netdata_log_health("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max); config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max); } else host->health_log.max = (unsigned int)n; + uint32_t m = config_get_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_DEFAULT_HISTORY); + if (m < HEALTH_LOG_MINIMUM_HISTORY) { + netdata_log_health("Host '%s': health configuration has invalid health log history %u. Using minimum %d", rrdhost_hostname(host), m, HEALTH_LOG_MINIMUM_HISTORY); + config_set_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_MINIMUM_HISTORY); + m = HEALTH_LOG_MINIMUM_HISTORY; + } + + //default health log history is 5 days and not less than a day + if (host->health_log.health_log_history) { + if (host->health_log.health_log_history < HEALTH_LOG_MINIMUM_HISTORY) + host->health_log.health_log_history = HEALTH_LOG_MINIMUM_HISTORY; + } else + host->health_log.health_log_history = m; + + netdata_log_health("[%s]: Health log history is set to %u seconds (%u days)", rrdhost_hostname(host), host->health_log.health_log_history, host->health_log.health_log_history / 86400); + conf_enabled_alarms = simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), NULL, SIMPLE_PATTERN_EXACT, true); - netdata_rwlock_init(&host->health_log.alarm_log_rwlock); + rw_spinlock_init(&host->health_log.spinlock); char filename[FILENAME_MAX + 1]; @@ -794,7 +846,6 @@ static void initialize_health(RRDHOST *host) // TODO: This needs to go to the metadata thread // Health should wait before accessing the table (needs to be created by the metadata thread) - sql_create_health_log_table(host); sql_health_alarm_log_load(host); // ------------------------------------------------------------------------ @@ -821,20 +872,20 @@ static void health_sleep(time_t next_run, unsigned int loop __maybe_unused) { time_t now = now_realtime_sec(); if(now < next_run) { worker_is_idle(); - debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now)); + netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now)); while (now < next_run && service_running(SERVICE_HEALTH)) { sleep_usec(USEC_PER_SEC); now = now_realtime_sec(); } } else { - debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop); + netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop); } } static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *silencers) { SILENCER *s; - debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s", + netdata_log_debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s", rrdcalc_name(rc), (rc->rrdset)?rrdset_context(rc->rrdset):"", rrdcalc_chart_name(rc), host, (rc->rrdset)?rrdset_family(rc->rrdset):""); for (s = silencers->silencers; s!=NULL; s=s->next){ @@ -845,11 +896,11 @@ static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *sil (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches_string(s->charts_pattern, rc->chart))) && (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches_string(s->families_pattern, rc->rrdset->family))) ) { - debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families); + netdata_log_debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families); if (unlikely(silencers->stype == STYPE_NONE)) { - debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc)); + netdata_log_debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc)); } else { - debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s" + netdata_log_debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s" , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced" , rrdcalc_name(rc) , (rc->rrdset)?rrdset_context(rc->rrdset):"" @@ -888,7 +939,7 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { } if (rrdcalc_flags_old != rc->run_flags) { - info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s", + netdata_log_info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s", rrdhost_hostname(host), rrdcalc_name(rc), (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false", @@ -905,7 +956,7 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { static void sql_health_postpone_queue_removed(RRDHOST *host __maybe_unused) { #ifdef ENABLE_ACLK - if (netdata_cloud_setting) { + if (netdata_cloud_enabled) { struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; if (unlikely(!wc)) { return; @@ -1001,7 +1052,7 @@ void *health_main(void *ptr) { while(service_running(SERVICE_HEALTH)) { loop++; - debug(D_HEALTH, "Health monitoring iteration no %u started", loop); + netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u started", loop); time_t now = now_realtime_sec(); int runnable = 0, apply_hibernation_delay = 0; @@ -1012,7 +1063,7 @@ void *health_main(void *ptr) { if (unlikely(check_if_resumed_from_suspension())) { apply_hibernation_delay = 1; - log_health( + netdata_log_health( "Postponing alarm checks for %"PRId64" seconds, " "because it seems that the system was just resumed from suspension.", (int64_t)hibernation_delay); @@ -1021,7 +1072,7 @@ void *health_main(void *ptr) { if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) { static int logged=0; if (!logged) { - log_health("Skipping health checks, because all alarms are disabled via a %s command.", + netdata_log_health("Skipping health checks, because all alarms are disabled via a %s command.", HEALTH_CMDAPI_CMD_DISABLEALL); logged = 1; } @@ -1044,7 +1095,7 @@ void *health_main(void *ptr) { rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host); if (unlikely(apply_hibernation_delay)) { - log_health( + netdata_log_health( "[%s]: Postponing health checks for %"PRId64" seconds.", rrdhost_hostname(host), (int64_t)hibernation_delay); @@ -1057,20 +1108,20 @@ void *health_main(void *ptr) { continue; } - log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host)); + netdata_log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host)); host->health.health_delay_up_to = 0; } // wait until cleanup of obsolete charts on children is complete if (host != localhost) { if (unlikely(host->trigger_chart_obsoletion_check == 1)) { - log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host)); + netdata_log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host)); continue; } } if (!health_running_logged) { - log_health("[%s]: Health is running.", rrdhost_hostname(host)); + netdata_log_health("[%s]: Health is running.", rrdhost_hostname(host)); health_running_logged = true; } @@ -1127,11 +1178,13 @@ void *health_main(void *ptr) { rc->old_status = rc->status; rc->status = RRDCALC_STATUS_REMOVED; rc->last_status_change = now; + rc->last_status_change_value = rc->value; rc->last_updated = now; rc->value = NAN; + rc->ae = ae; #ifdef ENABLE_ACLK - if (netdata_cloud_setting) + if (netdata_cloud_enabled) sql_queue_alarm_to_aclk(host, ae, 1); #endif } @@ -1170,7 +1223,7 @@ void *health_main(void *ptr) { rc->value = NAN; rc->run_flags |= RRDCALC_FLAG_DB_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), ret ); } else @@ -1181,14 +1234,14 @@ void *health_main(void *ptr) { rc->value = NAN; rc->run_flags |= RRDCALC_FLAG_DB_NAN; - debug(D_HEALTH, + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc) ); } else rc->run_flags &= ~RRDCALC_FLAG_DB_NAN; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->value ); } @@ -1204,14 +1257,14 @@ void *health_main(void *ptr) { rc->value = NAN; rc->run_flags |= RRDCALC_FLAG_CALC_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg) ); } else { rc->run_flags &= ~RRDCALC_FLAG_CALC_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " NETDATA_DOUBLE_FORMAT ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->calculation->parsed_as, rc->calculation->result, @@ -1248,14 +1301,14 @@ void *health_main(void *ptr) { // calculation failed rc->run_flags |= RRDCALC_FLAG_WARN_ERROR; - debug(D_HEALTH, + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), buffer_tostring(rc->warning->error_msg) ); } else { rc->run_flags &= ~RRDCALC_FLAG_WARN_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " NETDATA_DOUBLE_FORMAT ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->warning->result, buffer_tostring(rc->warning->error_msg), rrdcalc_source(rc) @@ -1274,14 +1327,14 @@ void *health_main(void *ptr) { // calculation failed rc->run_flags |= RRDCALC_FLAG_CRIT_ERROR; - debug(D_HEALTH, + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), buffer_tostring(rc->critical->error_msg) ); } else { rc->run_flags &= ~RRDCALC_FLAG_CRIT_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " NETDATA_DOUBLE_FORMAT ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->critical->result, buffer_tostring(rc->critical->error_msg), @@ -1297,36 +1350,37 @@ void *health_main(void *ptr) { RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED; switch (warning_status) { - case RRDCALC_STATUS_CLEAR: - status = RRDCALC_STATUS_CLEAR; - break; + case RRDCALC_STATUS_CLEAR: + status = RRDCALC_STATUS_CLEAR; + break; - case RRDCALC_STATUS_RAISED: - status = RRDCALC_STATUS_WARNING; - break; + case RRDCALC_STATUS_RAISED: + status = RRDCALC_STATUS_WARNING; + break; - default: - break; + default: + break; } switch (critical_status) { - case RRDCALC_STATUS_CLEAR: - if (status == RRDCALC_STATUS_UNDEFINED) - status = RRDCALC_STATUS_CLEAR; - break; + case RRDCALC_STATUS_CLEAR: + if (status == RRDCALC_STATUS_UNDEFINED) + status = RRDCALC_STATUS_CLEAR; + break; - case RRDCALC_STATUS_RAISED: - status = RRDCALC_STATUS_CRITICAL; - break; + case RRDCALC_STATUS_RAISED: + status = RRDCALC_STATUS_CRITICAL; + break; - default: - break; + default: + break; } // -------------------------------------------------------- // check if the new status and the old differ if (status != rc->status) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); int delay = 0; @@ -1392,11 +1446,19 @@ void *health_main(void *ptr) { health_alarm_log_add_entry(host, ae); - log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status)); + netdata_log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status)); + rc->last_status_change_value = rc->value; rc->last_status_change = now; rc->old_status = rc->status; rc->status = status; + rc->ae = ae; + + if(unlikely(rrdcalc_isrepeating(rc))) { + rc->last_repeat = now; + if (rc->status == RRDCALC_STATUS_CLEAR) + rc->run_flags |= RRDCALC_FLAG_RUN_ONCE; + } } rc->last_updated = now; @@ -1437,7 +1499,6 @@ void *health_main(void *ptr) { worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); rc->last_repeat = now; if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++; - ALARM_ENTRY *ae = health_create_alarm_entry( host, rc->id, @@ -1475,7 +1536,7 @@ void *health_main(void *ptr) { } rc->run_flags |= RRDCALC_FLAG_RUN_ONCE; health_process_notifications(host, ae); - debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id); + netdata_log_debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id); health_alarm_wait_for_execution(ae); health_alarm_log_free_one_nochecks_nounlink(ae); } @@ -1503,7 +1564,7 @@ void *health_main(void *ptr) { break; } #ifdef ENABLE_ACLK - if (netdata_cloud_setting) { + if (netdata_cloud_enabled) { struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; if (unlikely(!wc)) { continue; |