diff options
Diffstat (limited to 'health')
31 files changed, 400 insertions, 337 deletions
diff --git a/health/Makefile.am b/health/Makefile.am index 0ef55c75..20e00086 100644 --- a/health/Makefile.am +++ b/health/Makefile.am @@ -44,6 +44,7 @@ dist_healthconfig_DATA = \ health.d/elasticsearch.conf \ health.d/entropy.conf \ health.d/exporting.conf \ + health.d/file_descriptors.conf \ health.d/geth.conf \ health.d/ioping.conf \ health.d/gearman.conf \ diff --git a/health/health.c b/health/health.c index e04debb9..eeed3a67 100644 --- a/health/health.c +++ b/health/health.c @@ -22,6 +22,35 @@ char *silencers_filename; SIMPLE_PATTERN *conf_enabled_alarms = NULL; DICTIONARY *health_rrdvars; +void health_entry_flags_to_json_array(BUFFER *wb, const char *key, HEALTH_ENTRY_FLAGS flags) { + buffer_json_member_add_array(wb, key); + + if(flags & HEALTH_ENTRY_FLAG_PROCESSED) + buffer_json_add_array_item_string(wb, "PROCESSED"); + if(flags & HEALTH_ENTRY_FLAG_UPDATED) + buffer_json_add_array_item_string(wb, "UPDATED"); + if(flags & HEALTH_ENTRY_FLAG_EXEC_RUN) + buffer_json_add_array_item_string(wb, "EXEC_RUN"); + if(flags & HEALTH_ENTRY_FLAG_EXEC_FAILED) + buffer_json_add_array_item_string(wb, "EXEC_FAILED"); + if(flags & HEALTH_ENTRY_FLAG_SILENCED) + buffer_json_add_array_item_string(wb, "SILENCED"); + if(flags & HEALTH_ENTRY_RUN_ONCE) + buffer_json_add_array_item_string(wb, "RUN_ONCE"); + if(flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS) + buffer_json_add_array_item_string(wb, "EXEC_IN_PROGRESS"); + if(flags & HEALTH_ENTRY_FLAG_IS_REPEATING) + buffer_json_add_array_item_string(wb, "RECURRING"); + if(flags & HEALTH_ENTRY_FLAG_SAVED) + buffer_json_add_array_item_string(wb, "SAVED"); + if(flags & HEALTH_ENTRY_FLAG_ACLK_QUEUED) + buffer_json_add_array_item_string(wb, "ACLK_QUEUED"); + if(flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION) + buffer_json_add_array_item_string(wb, "NO_CLEAR_NOTIFICATION"); + + buffer_json_array_close(wb); +} + static bool prepare_command(BUFFER *wb, const char *exec, const char *recipient, @@ -52,8 +81,9 @@ static bool prepare_command(BUFFER *wb, const char *crit_alarms, const char *classification, const char *edit_command, - const char *machine_guid) -{ + const char *machine_guid, + uuid_t *transition_id +) { char buf[8192]; size_t n = 8192 - 1; @@ -159,6 +189,12 @@ static bool prepare_command(BUFFER *wb, return false; buffer_sprintf(wb, " '%s'", buf); + char tr_id[UUID_STR_LEN]; + uuid_unparse_lower(*transition_id, tr_id); + if (!sanitize_command_argument_string(buf, tr_id, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + return true; } @@ -257,22 +293,22 @@ static void health_silencers_init(void) { if (copied == (length* sizeof(char))) { str[length] = 0x00; json_parse(str, NULL, health_silencers_json_read_callback); - info("Parsed health silencers file %s", silencers_filename); + netdata_log_info("Parsed health silencers file %s", silencers_filename); } else { - error("Cannot read the data from health silencers file %s", silencers_filename); + netdata_log_error("Cannot read the data from health silencers file %s", silencers_filename); } freez(str); } } else { - error( - "Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.", - silencers_filename, - (int64_t)length, - HEALTH_SILENCERS_MAX_FILE_LEN); + netdata_log_error("Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.", + silencers_filename, + (int64_t)length, + HEALTH_SILENCERS_MAX_FILE_LEN); } fclose(fd); } else { - info("Cannot open the file %s, so Netdata will work with the default health configuration.",silencers_filename); + netdata_log_info("Cannot open the file %s, so Netdata will work with the default health configuration.", + silencers_filename); } } @@ -282,10 +318,10 @@ static void health_silencers_init(void) { * Initialize the health thread. */ void health_init(void) { - debug(D_HEALTH, "Health configuration initializing"); + netdata_log_debug(D_HEALTH, "Health configuration initializing"); if(!(default_health_enabled = (unsigned int)config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", default_health_enabled))) { - debug(D_HEALTH, "Health is disabled."); + netdata_log_debug(D_HEALTH, "Health is disabled."); return; } @@ -306,7 +342,7 @@ static void health_reload_host(RRDHOST *host) { if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) return; - log_health("[%s]: Reloading health.", rrdhost_hostname(host)); + netdata_log_health("[%s]: Reloading health.", rrdhost_hostname(host)); char *user_path = health_user_config_dir(); char *stock_path = health_stock_config_dir(); @@ -316,13 +352,13 @@ static void health_reload_host(RRDHOST *host) { rrdcalctemplate_delete_all(host); // invalidate all previous entries in the alarm log - netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_read_lock(&host->health_log.spinlock); ALARM_ENTRY *t; for(t = host->health_log.alarms ; t ; t = t->next) { if(t->new_status != RRDCALC_STATUS_REMOVED) t->flags |= HEALTH_ENTRY_FLAG_UPDATED; } - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_read_unlock(&host->health_log.spinlock); // reset all thresholds to all charts RRDSET *st; @@ -349,7 +385,7 @@ static void health_reload_host(RRDHOST *host) { rrdset_foreach_done(st); #ifdef ENABLE_ACLK - if (netdata_cloud_setting) { + if (netdata_cloud_enabled) { struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; if (likely(wc)) { wc->alert_queue_removed = SEND_REMOVED_AFTER_HEALTH_LOOPS; @@ -397,14 +433,15 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) { // do not send notifications for internal statuses - debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + netdata_log_debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); goto done; } if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) { // do not send notifications for disabled statuses - debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); - log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + netdata_log_debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + netdata_log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + // mark it as run, so that we will send the same alarm if it happens again goto done; } @@ -418,11 +455,11 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { if (likely(ret == 1)) { // we have executed this alarm notification in the past - if(last_executed_status == ae->new_status) { + if(last_executed_status == ae->new_status && !(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) { // don't send the notification for the same status again - debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae) + netdata_log_debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae) , rrdcalc_status2string(ae->new_status)); - log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae) + netdata_log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae) , rrdcalc_status2string(ae->new_status)); goto done; } @@ -432,7 +469,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { // so, don't send CLEAR notifications if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) { if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) { - debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s" + netdata_log_debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s" , ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); goto done; } @@ -442,11 +479,11 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { // Check if alarm notifications are silenced if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) { - log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + netdata_log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); goto done; } - log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + netdata_log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health.health_default_exec); const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health.health_default_recipient); @@ -533,7 +570,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { ae->old_value, ae->source?ae_source(ae):"UNKNOWN", (uint32_t)ae->duration, - (uint32_t)ae->non_clear_duration, + (ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING && ae->new_status >= RRDCALC_STATUS_WARNING) ? (uint32_t)ae->duration : (uint32_t)ae->non_clear_duration, ae_units(ae), ae_info(ae), ae_new_value_string(ae), @@ -546,20 +583,21 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { buffer_tostring(crit_alarms), ae->classification?ae_classification(ae):"Unknown", edit_command, - host != localhost ? host->machine_guid:""); + host->machine_guid, + &ae->transition_id); const char *command_to_run = buffer_tostring(wb); if (ok) { ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN; ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */ - debug(D_HEALTH, "executing command '%s'", command_to_run); + netdata_log_debug(D_HEALTH, "executing command '%s'", command_to_run); ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS; ae->exec_spawn_serial = spawn_enq_cmd(command_to_run); enqueue_alarm_notify_in_progress(ae); health_alarm_log_save(host, ae); } else { - error("Failed to format command arguments"); + netdata_log_error("Failed to format command arguments"); } buffer_free(wb); @@ -578,7 +616,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) { return; spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp); - debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code); + netdata_log_debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code); ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS; if(ae->exec_code != 0) @@ -588,7 +626,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) { } static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) { - debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s", + netdata_log_debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s", ae->chart?ae_chart_name(ae):"NOCHART", ae_name(ae), ae->new_value, rrdcalc_status2string(ae->old_status), @@ -602,31 +640,29 @@ static inline void health_alarm_log_process(RRDHOST *host) { uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0; time_t now = now_realtime_sec(); - netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_read_lock(&host->health_log.spinlock); ALARM_ENTRY *ae; for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) { - if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) { - if(unlikely( + if(unlikely( !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) && !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED) - )) { - if(unlikely(ae->unique_id < first_waiting)) - first_waiting = ae->unique_id; + )) { + if(unlikely(ae->unique_id < first_waiting)) + first_waiting = ae->unique_id; - if(likely(now >= ae->delay_up_to_timestamp)) - health_process_notifications(host, ae); - } + if(likely(now >= ae->delay_up_to_timestamp)) + health_process_notifications(host, ae); } } - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_read_unlock(&host->health_log.spinlock); // remember this for the next iteration host->health_last_processed_id = first_waiting; //delete those that are updated, no in progress execution, and is not repeating - netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_write_lock(&host->health_log.spinlock); ALARM_ENTRY *prev = NULL, *next = NULL; for(ae = host->health_log.alarms; ae ; ae = next) { @@ -639,7 +675,7 @@ static inline void health_alarm_log_process(RRDHOST *host) { || ((ae->new_status == RRDCALC_STATUS_REMOVED) && (ae->flags & HEALTH_ENTRY_FLAG_SAVED) && - (ae->when + 3600 < now_realtime_sec()))) + (ae->when + 86400 < now_realtime_sec()))) { if(host->health_log.alarms == ae) { @@ -658,12 +694,12 @@ static inline void health_alarm_log_process(RRDHOST *host) { prev = ae; } - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_write_unlock(&host->health_log.spinlock); } static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) { if(unlikely(!rc->rrdset)) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } @@ -674,27 +710,27 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) *next_run = rc->next_update; } - debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now)); + netdata_log_debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now)); return 0; } if(unlikely(!rc->update_every)) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); + netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } @@ -703,7 +739,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) time_t last = rrdset_last_entry_s(rc->rrdset); if(unlikely(now + update_every < first /* || now - update_every > last */)) { - debug(D_HEALTH + netdata_log_debug(D_HEALTH , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)." , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) now, (unsigned long) first , (unsigned long) last); @@ -714,7 +750,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) time_t needed = now + rc->before + rc->after; if(needed + update_every < first || needed - update_every > last) { - debug(D_HEALTH + netdata_log_debug(D_HEALTH , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)." , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) needed, (unsigned long) first , (unsigned long) last); @@ -747,10 +783,10 @@ static void health_main_cleanup(void *ptr) { struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; - info("cleaning up..."); + netdata_log_info("cleaning up..."); static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; - log_health("Health thread ended."); + netdata_log_health("Health thread ended."); } static void initialize_health(RRDHOST *host) @@ -762,7 +798,7 @@ static void initialize_health(RRDHOST *host) rrdhost_flag_set(host, RRDHOST_FLAG_INITIALIZED_HEALTH); - log_health("[%s]: Initializing health.", rrdhost_hostname(host)); + netdata_log_health("[%s]: Initializing health.", rrdhost_hostname(host)); host->health.health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never"); host->health.health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never"); @@ -775,16 +811,32 @@ static void initialize_health(RRDHOST *host) long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max); if(n < 10) { - error("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max); + netdata_log_health("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max); config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max); } else host->health_log.max = (unsigned int)n; + uint32_t m = config_get_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_DEFAULT_HISTORY); + if (m < HEALTH_LOG_MINIMUM_HISTORY) { + netdata_log_health("Host '%s': health configuration has invalid health log history %u. Using minimum %d", rrdhost_hostname(host), m, HEALTH_LOG_MINIMUM_HISTORY); + config_set_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_MINIMUM_HISTORY); + m = HEALTH_LOG_MINIMUM_HISTORY; + } + + //default health log history is 5 days and not less than a day + if (host->health_log.health_log_history) { + if (host->health_log.health_log_history < HEALTH_LOG_MINIMUM_HISTORY) + host->health_log.health_log_history = HEALTH_LOG_MINIMUM_HISTORY; + } else + host->health_log.health_log_history = m; + + netdata_log_health("[%s]: Health log history is set to %u seconds (%u days)", rrdhost_hostname(host), host->health_log.health_log_history, host->health_log.health_log_history / 86400); + conf_enabled_alarms = simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), NULL, SIMPLE_PATTERN_EXACT, true); - netdata_rwlock_init(&host->health_log.alarm_log_rwlock); + rw_spinlock_init(&host->health_log.spinlock); char filename[FILENAME_MAX + 1]; @@ -794,7 +846,6 @@ static void initialize_health(RRDHOST *host) // TODO: This needs to go to the metadata thread // Health should wait before accessing the table (needs to be created by the metadata thread) - sql_create_health_log_table(host); sql_health_alarm_log_load(host); // ------------------------------------------------------------------------ @@ -821,20 +872,20 @@ static void health_sleep(time_t next_run, unsigned int loop __maybe_unused) { time_t now = now_realtime_sec(); if(now < next_run) { worker_is_idle(); - debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now)); + netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now)); while (now < next_run && service_running(SERVICE_HEALTH)) { sleep_usec(USEC_PER_SEC); now = now_realtime_sec(); } } else { - debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop); + netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop); } } static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *silencers) { SILENCER *s; - debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s", + netdata_log_debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s", rrdcalc_name(rc), (rc->rrdset)?rrdset_context(rc->rrdset):"", rrdcalc_chart_name(rc), host, (rc->rrdset)?rrdset_family(rc->rrdset):""); for (s = silencers->silencers; s!=NULL; s=s->next){ @@ -845,11 +896,11 @@ static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *sil (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches_string(s->charts_pattern, rc->chart))) && (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches_string(s->families_pattern, rc->rrdset->family))) ) { - debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families); + netdata_log_debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families); if (unlikely(silencers->stype == STYPE_NONE)) { - debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc)); + netdata_log_debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc)); } else { - debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s" + netdata_log_debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s" , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced" , rrdcalc_name(rc) , (rc->rrdset)?rrdset_context(rc->rrdset):"" @@ -888,7 +939,7 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { } if (rrdcalc_flags_old != rc->run_flags) { - info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s", + netdata_log_info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s", rrdhost_hostname(host), rrdcalc_name(rc), (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false", @@ -905,7 +956,7 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { static void sql_health_postpone_queue_removed(RRDHOST *host __maybe_unused) { #ifdef ENABLE_ACLK - if (netdata_cloud_setting) { + if (netdata_cloud_enabled) { struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; if (unlikely(!wc)) { return; @@ -1001,7 +1052,7 @@ void *health_main(void *ptr) { while(service_running(SERVICE_HEALTH)) { loop++; - debug(D_HEALTH, "Health monitoring iteration no %u started", loop); + netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u started", loop); time_t now = now_realtime_sec(); int runnable = 0, apply_hibernation_delay = 0; @@ -1012,7 +1063,7 @@ void *health_main(void *ptr) { if (unlikely(check_if_resumed_from_suspension())) { apply_hibernation_delay = 1; - log_health( + netdata_log_health( "Postponing alarm checks for %"PRId64" seconds, " "because it seems that the system was just resumed from suspension.", (int64_t)hibernation_delay); @@ -1021,7 +1072,7 @@ void *health_main(void *ptr) { if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) { static int logged=0; if (!logged) { - log_health("Skipping health checks, because all alarms are disabled via a %s command.", + netdata_log_health("Skipping health checks, because all alarms are disabled via a %s command.", HEALTH_CMDAPI_CMD_DISABLEALL); logged = 1; } @@ -1044,7 +1095,7 @@ void *health_main(void *ptr) { rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host); if (unlikely(apply_hibernation_delay)) { - log_health( + netdata_log_health( "[%s]: Postponing health checks for %"PRId64" seconds.", rrdhost_hostname(host), (int64_t)hibernation_delay); @@ -1057,20 +1108,20 @@ void *health_main(void *ptr) { continue; } - log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host)); + netdata_log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host)); host->health.health_delay_up_to = 0; } // wait until cleanup of obsolete charts on children is complete if (host != localhost) { if (unlikely(host->trigger_chart_obsoletion_check == 1)) { - log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host)); + netdata_log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host)); continue; } } if (!health_running_logged) { - log_health("[%s]: Health is running.", rrdhost_hostname(host)); + netdata_log_health("[%s]: Health is running.", rrdhost_hostname(host)); health_running_logged = true; } @@ -1127,11 +1178,13 @@ void *health_main(void *ptr) { rc->old_status = rc->status; rc->status = RRDCALC_STATUS_REMOVED; rc->last_status_change = now; + rc->last_status_change_value = rc->value; rc->last_updated = now; rc->value = NAN; + rc->ae = ae; #ifdef ENABLE_ACLK - if (netdata_cloud_setting) + if (netdata_cloud_enabled) sql_queue_alarm_to_aclk(host, ae, 1); #endif } @@ -1170,7 +1223,7 @@ void *health_main(void *ptr) { rc->value = NAN; rc->run_flags |= RRDCALC_FLAG_DB_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), ret ); } else @@ -1181,14 +1234,14 @@ void *health_main(void *ptr) { rc->value = NAN; rc->run_flags |= RRDCALC_FLAG_DB_NAN; - debug(D_HEALTH, + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc) ); } else rc->run_flags &= ~RRDCALC_FLAG_DB_NAN; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->value ); } @@ -1204,14 +1257,14 @@ void *health_main(void *ptr) { rc->value = NAN; rc->run_flags |= RRDCALC_FLAG_CALC_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg) ); } else { rc->run_flags &= ~RRDCALC_FLAG_CALC_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " NETDATA_DOUBLE_FORMAT ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->calculation->parsed_as, rc->calculation->result, @@ -1248,14 +1301,14 @@ void *health_main(void *ptr) { // calculation failed rc->run_flags |= RRDCALC_FLAG_WARN_ERROR; - debug(D_HEALTH, + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), buffer_tostring(rc->warning->error_msg) ); } else { rc->run_flags &= ~RRDCALC_FLAG_WARN_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " NETDATA_DOUBLE_FORMAT ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->warning->result, buffer_tostring(rc->warning->error_msg), rrdcalc_source(rc) @@ -1274,14 +1327,14 @@ void *health_main(void *ptr) { // calculation failed rc->run_flags |= RRDCALC_FLAG_CRIT_ERROR; - debug(D_HEALTH, + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), buffer_tostring(rc->critical->error_msg) ); } else { rc->run_flags &= ~RRDCALC_FLAG_CRIT_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " + netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " NETDATA_DOUBLE_FORMAT ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->critical->result, buffer_tostring(rc->critical->error_msg), @@ -1297,36 +1350,37 @@ void *health_main(void *ptr) { RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED; switch (warning_status) { - case RRDCALC_STATUS_CLEAR: - status = RRDCALC_STATUS_CLEAR; - break; + case RRDCALC_STATUS_CLEAR: + status = RRDCALC_STATUS_CLEAR; + break; - case RRDCALC_STATUS_RAISED: - status = RRDCALC_STATUS_WARNING; - break; + case RRDCALC_STATUS_RAISED: + status = RRDCALC_STATUS_WARNING; + break; - default: - break; + default: + break; } switch (critical_status) { - case RRDCALC_STATUS_CLEAR: - if (status == RRDCALC_STATUS_UNDEFINED) - status = RRDCALC_STATUS_CLEAR; - break; + case RRDCALC_STATUS_CLEAR: + if (status == RRDCALC_STATUS_UNDEFINED) + status = RRDCALC_STATUS_CLEAR; + break; - case RRDCALC_STATUS_RAISED: - status = RRDCALC_STATUS_CRITICAL; - break; + case RRDCALC_STATUS_RAISED: + status = RRDCALC_STATUS_CRITICAL; + break; - default: - break; + default: + break; } // -------------------------------------------------------- // check if the new status and the old differ if (status != rc->status) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); int delay = 0; @@ -1392,11 +1446,19 @@ void *health_main(void *ptr) { health_alarm_log_add_entry(host, ae); - log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status)); + netdata_log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status)); + rc->last_status_change_value = rc->value; rc->last_status_change = now; rc->old_status = rc->status; rc->status = status; + rc->ae = ae; + + if(unlikely(rrdcalc_isrepeating(rc))) { + rc->last_repeat = now; + if (rc->status == RRDCALC_STATUS_CLEAR) + rc->run_flags |= RRDCALC_FLAG_RUN_ONCE; + } } rc->last_updated = now; @@ -1437,7 +1499,6 @@ void *health_main(void *ptr) { worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); rc->last_repeat = now; if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++; - ALARM_ENTRY *ae = health_create_alarm_entry( host, rc->id, @@ -1475,7 +1536,7 @@ void *health_main(void *ptr) { } rc->run_flags |= RRDCALC_FLAG_RUN_ONCE; health_process_notifications(host, ae); - debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id); + netdata_log_debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id); health_alarm_wait_for_execution(ae); health_alarm_log_free_one_nochecks_nounlink(ae); } @@ -1503,7 +1564,7 @@ void *health_main(void *ptr) { break; } #ifdef ENABLE_ACLK - if (netdata_cloud_setting) { + if (netdata_cloud_enabled) { struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config; if (unlikely(!wc)) { continue; diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf index 65f1a69a..7a0afcd1 100644 --- a/health/health.d/apcupsd.conf +++ b/health/health.d/apcupsd.conf @@ -11,7 +11,6 @@ component: UPS units: % every: 1m warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 10m multiplier 1.5 max 1h info: average UPS load over the last 10 minutes to: sitemgr @@ -29,7 +28,7 @@ component: UPS units: % every: 60s warn: $this < 100 - crit: $this < (($status == $CRITICAL) ? (60) : (50)) + crit: $this < 40 delay: down 10m multiplier 1.5 max 1h info: average UPS charge over the last minute to: sitemgr @@ -43,7 +42,6 @@ component: UPS device every: 10s units: seconds ago warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: sitemgr diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf index 49cb5ad0..3f92e80d 100644 --- a/health/health.d/bcache.conf +++ b/health/health.d/bcache.conf @@ -22,8 +22,7 @@ component: Disk calc: $dirty + $metadata + $undefined units: % every: 1m - warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) ) - crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) ) + warn: $this > 75 delay: up 1m down 1h multiplier 1.5 max 2h info: percentage of cache space used for dirty data and metadata \ (this usually means your SSD cache is too small) diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf index 13ac8c18..4ee8bc0b 100644 --- a/health/health.d/beanstalkd.conf +++ b/health/health.d/beanstalkd.conf @@ -8,8 +8,7 @@ component: Beanstalk calc: $buried units: jobs every: 10s - warn: $this > 0 - crit: $this > 10 + warn: $this > 3 delay: up 0 down 5m multiplier 1.2 max 1h info: number of buried jobs across all tubes. \ You need to manually kick them so they can be processed. \ diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf index 7c09225f..b3e75a23 100644 --- a/health/health.d/bind_rndc.conf +++ b/health/health.d/bind_rndc.conf @@ -7,6 +7,5 @@ component: BIND every: 60 calc: $stats_size warn: $this > 512 - crit: $this > 1024 info: BIND statistics-file size to: sysadmin diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf index 6f37787d..b7dcbe31 100644 --- a/health/health.d/boinc.conf +++ b/health/health.d/boinc.conf @@ -12,7 +12,6 @@ component: BOINC units: tasks every: 1m warn: $this > 0 - crit: $this > 1 delay: up 1m down 5m multiplier 1.5 max 1h info: average number of compute errors over the last 10 minutes to: sysadmin @@ -29,7 +28,6 @@ component: BOINC units: tasks every: 1m warn: $this > 0 - crit: $this > 1 delay: up 1m down 5m multiplier 1.5 max 1h info: average number of failed uploads over the last 10 minutes to: sysadmin @@ -46,7 +44,6 @@ component: BOINC units: tasks every: 1m warn: $this < 1 - crit: $this < 0.1 delay: up 5m down 10m multiplier 1.5 max 1h info: average number of total tasks over the last 10 minutes to: sysadmin @@ -64,7 +61,6 @@ component: BOINC units: tasks every: 1m warn: $this < 1 - crit: $this < 0.1 delay: up 5m down 10m multiplier 1.5 max 1h info: average number of active tasks over the last 10 minutes to: sysadmin diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf index 08260ff6..f625e545 100644 --- a/health/health.d/cgroups.conf +++ b/health/health.d/cgroups.conf @@ -64,7 +64,6 @@ component: Network every: 10s units: % warn: $this > (($status >= $WARNING)?(200):(5000)) - crit: $this > (($status == $CRITICAL)?(5000):(6000)) options: no-clear-notification info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ compared to the rate over the last minute @@ -83,7 +82,6 @@ component: CPU units: % every: 1m warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h info: average cgroup CPU utilization over the last 10 minutes to: sysadmin @@ -134,7 +132,6 @@ component: Network every: 10s units: % warn: $this > (($status >= $WARNING)?(200):(5000)) - crit: $this > (($status == $CRITICAL)?(5000):(6000)) options: no-clear-notification info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ compared to the rate over the last minute diff --git a/health/health.d/consul.conf b/health/health.d/consul.conf index dff6d2df..7edca656 100644 --- a/health/health.d/consul.conf +++ b/health/health.d/consul.conf @@ -10,7 +10,7 @@ component: Consul units: seconds warn: $this < 14*24*60*60 crit: $this < 7*24*60*60 - info: Consul Enterprise licence expiration time on node ${label:node_name} datacenter ${label:datacenter} + info: Consul Enterprise license expiration time on node ${label:node_name} datacenter ${label:datacenter} to: sysadmin template: consul_autopilot_health_status diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf index ad695282..907d6ff8 100644 --- a/health/health.d/cpu.conf +++ b/health/health.d/cpu.conf @@ -28,7 +28,6 @@ component: CPU units: % every: 1m warn: $this > (($status >= $WARNING) ? (20) : (40)) - crit: $this > (($status == $CRITICAL) ? (40) : (50)) delay: down 15m multiplier 1.5 max 1h info: average CPU iowait time over the last 10 minutes to: sysadmin @@ -44,7 +43,6 @@ component: CPU units: % every: 5m warn: $this > (($status >= $WARNING) ? (5) : (10)) - crit: $this > (($status == $CRITICAL) ? (20) : (30)) delay: down 1h multiplier 1.5 max 2h info: average CPU steal time over the last 20 minutes to: sysadmin diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf index 010b9459..81d37df6 100644 --- a/health/health.d/dnsmasq_dhcp.conf +++ b/health/health.d/dnsmasq_dhcp.conf @@ -9,7 +9,6 @@ component: Dnsmasq units: % calc: $used warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) ) - crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) ) delay: down 5m info: DHCP range utilization to: sysadmin diff --git a/health/health.d/docker.conf b/health/health.d/docker.conf index f1702847..01919dc0 100644 --- a/health/health.d/docker.conf +++ b/health/health.d/docker.conf @@ -6,6 +6,6 @@ component: Docker units: status every: 10s lookup: average -10s of unhealthy - crit: $this > 0 + warn: $this > 0 info: ${label:container_name} docker container health status is unhealthy to: sysadmin diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf index 47f8e1eb..29f1e9b2 100644 --- a/health/health.d/elasticsearch.conf +++ b/health/health.d/elasticsearch.conf @@ -10,7 +10,7 @@ component: Elasticsearch lookup: average -5s unaligned of *ed every: 10s units: status - warn: $this == 1 + crit: $this == 1 delay: down 5m multiplier 1.5 max 1h info: cluster health status is red. to: sysadmin diff --git a/health/health.d/file_descriptors.conf b/health/health.d/file_descriptors.conf new file mode 100644 index 00000000..d136ea51 --- /dev/null +++ b/health/health.d/file_descriptors.conf @@ -0,0 +1,31 @@ + # you can disable an alarm notification by setting the 'to' line to: silent + + template: system_file_descriptors_utilization + on: system.file_nr_utilization + class: Utilization + type: System + component: Processes + hosts: * + lookup: max -1m unaligned + units: % + every: 1m + crit: $this > 90 + delay: down 15m multiplier 1.5 max 1h + info: system-wide utilization of open files + to: sysadmin + + template: apps_group_file_descriptors_utilization + on: apps.fd_limit + class: Utilization + type: System +component: Process + os: linux + module: !* * + hosts: * + lookup: max -1m unaligned foreach * + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (85) : (90)) + delay: down 15m multiplier 1.5 max 1h + info: maximum utilization of open files among all application group PIDs + to: sysadmin diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf index 14010d44..580d114f 100644 --- a/health/health.d/gearman.conf +++ b/health/health.d/gearman.conf @@ -8,7 +8,6 @@ component: Gearman units: workers every: 10s warn: $this > 30000 - crit: $this > 100000 delay: down 5m multiplier 1.5 max 1h info: average number of queued jobs over the last 10 minutes to: sysadmin diff --git a/health/health.d/geth.conf b/health/health.d/geth.conf index dd1eb470..361b6b41 100644 --- a/health/health.d/geth.conf +++ b/health/health.d/geth.conf @@ -8,5 +8,4 @@ component: geth calc: $chain_head_block - $chain_head_header units: blocks warn: $this != 0 - crit: $this > 5 delay: down 1m multiplier 1.5 max 1h diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf index 2786cbd6..47ac4453 100644 --- a/health/health.d/ioping.conf +++ b/health/health.d/ioping.conf @@ -6,10 +6,8 @@ component: Disk lookup: average -10s unaligned of latency units: microseconds every: 10s - green: 5000 - red: 10000 + green: 10000 warn: $this > $green - crit: $this > $red delay: down 30m multiplier 1.5 max 2h info: average I/O latency over the last 10 seconds to: sysadmin diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf index c178a410..3d1b46c0 100644 --- a/health/health.d/ipc.conf +++ b/health/health.d/ipc.conf @@ -12,7 +12,6 @@ component: IPC units: % every: 10s warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (70) : (90)) delay: down 5m multiplier 1.5 max 1h info: IPC semaphore utilization to: sysadmin @@ -28,7 +27,6 @@ component: IPC units: % every: 10s warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (70) : (90)) delay: down 5m multiplier 1.5 max 1h info: IPC semaphore arrays utilization to: sysadmin diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf index feadba1b..4d6478cc 100644 --- a/health/health.d/ipmi.conf +++ b/health/health.d/ipmi.conf @@ -1,15 +1,15 @@ - alarm: ipmi_sensors_states - on: ipmi.sensors_states + template: ipmi_sensor_state + on: ipmi.sensor_state class: Errors type: System component: IPMI calc: $warning + $critical - units: sensors + units: state every: 10s - warn: $this > 0 + warn: $warning > 0 crit: $critical > 0 delay: up 5m down 15m multiplier 1.5 max 1h - info: number of IPMI sensors in non-nominal state + info: IPMI sensor ${label:sensor} (${label:component}) state to: sysadmin alarm: ipmi_events diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf index c0bc6de8..4562122c 100644 --- a/health/health.d/linux_power_supply.conf +++ b/health/health.d/linux_power_supply.conf @@ -9,7 +9,6 @@ component: Battery units: % every: 10s warn: $this < 10 - crit: $this < 5 delay: up 30s down 5m multiplier 1.2 max 1h info: percentage of remaining power supply capacity to: sysadmin diff --git a/health/health.d/nut.conf b/health/health.d/nut.conf index 6231dd97..67843205 100644 --- a/health/health.d/nut.conf +++ b/health/health.d/nut.conf @@ -26,8 +26,8 @@ component: UPS lookup: average -60s unaligned of battery_charge units: % every: 60s - warn: $this < 100 - crit: $this < (($status == $CRITICAL) ? (60) : (50)) + warn: $this < 75 + crit: $this < 40 delay: down 10m multiplier 1.5 max 1h info: average UPS charge over the last minute to: sitemgr diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf index ee6c57cc..045930ae 100644 --- a/health/health.d/pihole.conf +++ b/health/health.d/pihole.conf @@ -10,8 +10,7 @@ component: Pi-hole every: 10s units: seconds calc: $ago - warn: $this > 60 * 60 * 24 * 8 - crit: $this > 60 * 60 * 24 * 8 * 2 + warn: $this > 60 * 60 * 24 * 30 info: gravity.list (blocklist) file last update time to: sysadmin diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index ab382c43..34e5431a 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -28,7 +28,6 @@ component: Memory units: % every: 10s warn: $this < (($status >= $WARNING) ? (15) : (10)) - crit: $this < (($status == $CRITICAL) ? (10) : ( 5)) delay: down 15m multiplier 1.5 max 1h info: percentage of estimated amount of RAM available for userspace processes, without causing swapping to: sysadmin @@ -74,7 +73,6 @@ component: Memory units: % every: 10s warn: $this < (($status >= $WARNING) ? (15) : (10)) - crit: $this < (($status == $CRITICAL) ? (10) : ( 5)) delay: down 15m multiplier 1.5 max 1h info: percentage of estimated amount of RAM available for userspace processes, without causing swapping to: sysadmin diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf index ab110bf0..27a857fc 100644 --- a/health/health.d/scaleio.conf +++ b/health/health.d/scaleio.conf @@ -9,8 +9,8 @@ component: ScaleIO calc: $used units: % every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) + warn: $this > (($status >= $WARNING) ? (80) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (90)) delay: down 15m multiplier 1.5 max 1h info: storage pool capacity utilization to: sysadmin diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf index a9cc7cee..bff34cd3 100644 --- a/health/health.d/vcsa.conf +++ b/health/health.d/vcsa.conf @@ -133,8 +133,7 @@ component: VMware vCenter lookup: max -10s unaligned of software_packages units: status every: 10s - warn: $this == 4 - crit: $this == 3 + warn: ($this == 3) || ($this == 4) delay: down 1m multiplier 1.5 max 1h info: software updates availability status \ (-1: unknown, 0: green, 2: orange, 3: red, 4: grey) diff --git a/health/health.d/windows.conf b/health/health.d/windows.conf index d4bc7639..28a88638 100644 --- a/health/health.d/windows.conf +++ b/health/health.d/windows.conf @@ -6,7 +6,7 @@ class: Utilization type: Windows component: CPU - os: linux + os: * hosts: * lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt units: % @@ -25,7 +25,7 @@ component: CPU class: Utilization type: Windows component: Memory - os: linux + os: * hosts: * calc: ($used) * 100 / ($used + $available) units: % @@ -36,31 +36,15 @@ component: Memory info: memory utilization to: sysadmin - template: windows_swap_in_use - on: windows.memory_swap_utilization - class: Utilization - type: Windows -component: Memory - os: linux - hosts: * - calc: ($used) * 100 / ($used + $available) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - info: swap memory utilization - to: sysadmin - ## Network template: windows_inbound_packets_discarded - on: windows.net_discarded + on: windows.net_nic_discarded class: Errors type: Windows component: Network - os: linux + os: * hosts: * lookup: sum -10m unaligned absolute match-names of inbound units: packets @@ -71,11 +55,11 @@ component: Network to: sysadmin template: windows_outbound_packets_discarded - on: windows.net_discarded + on: windows.net_nic_discarded class: Errors type: Windows component: Network - os: linux + os: * hosts: * lookup: sum -10m unaligned absolute match-names of outbound units: packets @@ -86,11 +70,11 @@ component: Network to: sysadmin template: windows_inbound_packets_errors - on: windows.net_errors + on: windows.net_nic_errors class: Errors type: Windows component: Network - os: linux + os: * hosts: * lookup: sum -10m unaligned absolute match-names of inbound units: packets @@ -101,11 +85,11 @@ component: Network to: sysadmin template: windows_outbound_packets_errors - on: windows.net_errors + on: windows.net_nic_errors class: Errors type: Windows component: Network - os: linux + os: * hosts: * lookup: sum -10m unaligned absolute match-names of outbound units: packets @@ -119,11 +103,11 @@ component: Network ## Disk template: windows_disk_in_use - on: windows.logical_disk_utilization + on: windows.logical_disk_space_usage class: Utilization type: Windows component: Disk - os: linux + os: * hosts: * calc: ($used) * 100 / ($used + $free) units: % diff --git a/health/health.h b/health/health.h index c36aabac..543bc56a 100644 --- a/health/health.h +++ b/health/health.h @@ -7,18 +7,21 @@ extern unsigned int default_health_enabled; -#define HEALTH_ENTRY_FLAG_PROCESSED 0x00000001 -#define HEALTH_ENTRY_FLAG_UPDATED 0x00000002 -#define HEALTH_ENTRY_FLAG_EXEC_RUN 0x00000004 -#define HEALTH_ENTRY_FLAG_EXEC_FAILED 0x00000008 -#define HEALTH_ENTRY_FLAG_SILENCED 0x00000010 -#define HEALTH_ENTRY_RUN_ONCE 0x00000020 -#define HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS 0x00000040 -#define HEALTH_ENTRY_FLAG_IS_REPEATING 0x00000080 - -#define HEALTH_ENTRY_FLAG_SAVED 0x10000000 -#define HEALTH_ENTRY_FLAG_ACLK_QUEUED 0x20000000 -#define HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION 0x80000000 +typedef enum __attribute__((packed)) { + HEALTH_ENTRY_FLAG_PROCESSED = 0x00000001, // notifications engine has processed this + HEALTH_ENTRY_FLAG_UPDATED = 0x00000002, // there is a more recent update about this transition + HEALTH_ENTRY_FLAG_EXEC_RUN = 0x00000004, // notification script has been run (this is the intent, not the result) + HEALTH_ENTRY_FLAG_EXEC_FAILED = 0x00000008, // notification script couldn't be run + HEALTH_ENTRY_FLAG_SILENCED = 0x00000010, + HEALTH_ENTRY_RUN_ONCE = 0x00000020, + HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS = 0x00000040, + HEALTH_ENTRY_FLAG_IS_REPEATING = 0x00000080, + HEALTH_ENTRY_FLAG_SAVED = 0x10000000, // Saved to SQL + HEALTH_ENTRY_FLAG_ACLK_QUEUED = 0x20000000, // Sent to Netdata Cloud + HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION = 0x80000000, +} HEALTH_ENTRY_FLAGS; + +void health_entry_flags_to_json_array(BUFFER *wb, const char *key, HEALTH_ENTRY_FLAGS flags); #ifndef HEALTH_LISTEN_PORT #define HEALTH_LISTEN_PORT 19998 @@ -28,6 +31,14 @@ extern unsigned int default_health_enabled; #define HEALTH_LISTEN_BACKLOG 4096 #endif +#ifndef HEALTH_LOG_DEFAULT_HISTORY +#define HEALTH_LOG_DEFAULT_HISTORY 432000 +#endif + +#ifndef HEALTH_LOG_MINIMUM_HISTORY +#define HEALTH_LOG_MINIMUM_HISTORY 86400 +#endif + #define HEALTH_SILENCERS_MAX_FILE_LEN 10000 extern char *silencers_filename; @@ -40,6 +51,7 @@ void health_reload(void); void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status); void health_alarms2json(RRDHOST *host, BUFFER *wb, int all); +void health_alert2json_conf(RRDHOST *host, BUFFER *wb, CONTEXTS_V2_OPTIONS all); void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all); void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf); @@ -73,7 +85,7 @@ ALARM_ENTRY* health_create_alarm_entry( STRING *units, STRING *info, int delay, - uint32_t flags); + HEALTH_ENTRY_FLAGS flags); void health_alarm_log_add_entry(RRDHOST *host, ALARM_ENTRY *ae); diff --git a/health/health_config.c b/health/health_config.c index a11fd51c..4e93235e 100644 --- a/health/health_config.c +++ b/health/health_config.c @@ -61,36 +61,36 @@ static inline int health_parse_delay( if(!strcasecmp(key, "up")) { if (!config_parse_duration(value, delay_up_duration)) { - error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", - line, filename, value, key); + netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", + line, filename, value, key); } else given_up = 1; } else if(!strcasecmp(key, "down")) { if (!config_parse_duration(value, delay_down_duration)) { - error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", - line, filename, value, key); + netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", + line, filename, value, key); } else given_down = 1; } else if(!strcasecmp(key, "multiplier")) { *delay_multiplier = strtof(value, NULL); if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0)) { - error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", - line, filename, value, key); + netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", + line, filename, value, key); } else given_multiplier = 1; } else if(!strcasecmp(key, "max")) { if (!config_parse_duration(value, delay_max_duration)) { - error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", - line, filename, value, key); + netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", + line, filename, value, key); } else given_max = 1; } else { - error("Health configuration at line %zu of file '%s': unknown keyword '%s'", - line, filename, key); + netdata_log_error("Health configuration at line %zu of file '%s': unknown keyword '%s'", + line, filename, key); } } @@ -136,7 +136,7 @@ static inline uint32_t health_parse_options(const char *s) { if(!strcasecmp(buf, "no-clear-notification") || !strcasecmp(buf, "no-clear")) options |= RRDCALC_OPTION_NO_CLEAR_NOTIFICATION; else - error("Ignoring unknown alarm option '%s'", buf); + netdata_log_error("Ignoring unknown alarm option '%s'", buf); } } @@ -171,14 +171,14 @@ static inline int health_parse_repeat( } if(!strcasecmp(key, "warning")) { if (!config_parse_duration(value, (int*)warn_repeat_every)) { - error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", - line, file, value, key); + netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", + line, file, value, key); } } else if(!strcasecmp(key, "critical")) { if (!config_parse_duration(value, (int*)crit_repeat_every)) { - error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", - line, file, value, key); + netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", + line, file, value, key); } } } @@ -308,7 +308,7 @@ static inline int health_parse_db_lookup( RRDR_TIME_GROUPING *group_method, int *after, int *before, int *every, RRDCALC_OPTIONS *options, STRING **dimensions, STRING **foreachdim ) { - debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s: %s", line, filename, string); + netdata_log_debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s: %s", line, filename, string); if(*dimensions) string_freez(*dimensions); if(*foreachdim) string_freez(*foreachdim); @@ -326,14 +326,14 @@ static inline int health_parse_db_lookup( while(*s && !isspace(*s)) s++; while(*s && isspace(*s)) *s++ = '\0'; if(!*s) { - error("Health configuration invalid chart calculation at line %zu of file '%s': expected group method followed by the 'after' time, but got '%s'", - line, filename, key); + netdata_log_error("Health configuration invalid chart calculation at line %zu of file '%s': expected group method followed by the 'after' time, but got '%s'", + line, filename, key); return 0; } if((*group_method = time_grouping_parse(key, RRDR_GROUPING_UNDEFINED)) == RRDR_GROUPING_UNDEFINED) { - error("Health configuration at line %zu of file '%s': invalid group method '%s'", - line, filename, key); + netdata_log_error("Health configuration at line %zu of file '%s': invalid group method '%s'", + line, filename, key); return 0; } @@ -343,8 +343,8 @@ static inline int health_parse_db_lookup( while(*s && isspace(*s)) *s++ = '\0'; if(!config_parse_duration(key, after)) { - error("Health configuration at line %zu of file '%s': invalid duration '%s' after group method", - line, filename, key); + netdata_log_error("Health configuration at line %zu of file '%s': invalid duration '%s' after group method", + line, filename, key); return 0; } @@ -364,8 +364,8 @@ static inline int health_parse_db_lookup( while(*s && isspace(*s)) *s++ = '\0'; if (!config_parse_duration(value, before)) { - error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword", - line, filename, value, key); + netdata_log_error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword", + line, filename, value, key); } } else if(!strcasecmp(key, HEALTH_EVERY_KEY)) { @@ -374,8 +374,8 @@ static inline int health_parse_db_lookup( while(*s && isspace(*s)) *s++ = '\0'; if (!config_parse_duration(value, every)) { - error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword", - line, filename, value, key); + netdata_log_error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword", + line, filename, value, key); } } else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) { @@ -422,8 +422,8 @@ static inline int health_parse_db_lookup( break; } else { - error("Health configuration at line %zu of file '%s': unknown keyword '%s'", - line, filename, key); + netdata_log_error("Health configuration at line %zu of file '%s': unknown keyword '%s'", + line, filename, key); } } @@ -499,6 +499,7 @@ static inline void alert_config_free(struct alert_config *cfg) string_freez(cfg->p_db_lookup_dimensions); string_freez(cfg->p_db_lookup_method); string_freez(cfg->chart_labels); + string_freez(cfg->source); freez(cfg); } @@ -506,7 +507,7 @@ int sql_store_hashes = 1; static int health_readfile(const char *filename, void *data) { RRDHOST *host = (RRDHOST *)data; - debug(D_HEALTH, "Health configuration reading file '%s'", filename); + netdata_log_debug(D_HEALTH, "Health configuration reading file '%s'", filename); static uint32_t hash_alarm = 0, @@ -573,7 +574,7 @@ static int health_readfile(const char *filename, void *data) { FILE *fp = fopen(filename, "r"); if(!fp) { - error("Health configuration cannot read file '%s'.", filename); + netdata_log_error("Health configuration cannot read file '%s'.", filename); return 0; } @@ -597,7 +598,8 @@ static int health_readfile(const char *filename, void *data) { if(append < HEALTH_CONF_MAX_LINE) continue; else { - error("Health configuration has too long multi-line at line %zu of file '%s'.", line, filename); + netdata_log_error("Health configuration has too long multi-line at line %zu of file '%s'.", + line, filename); } } append = 0; @@ -605,7 +607,8 @@ static int health_readfile(const char *filename, void *data) { char *key = s; while(*s && *s != ':') s++; if(!*s) { - error("Health configuration has invalid line %zu of file '%s'. It does not contain a ':'. Ignoring it.", line, filename); + netdata_log_error("Health configuration has invalid line %zu of file '%s'. It does not contain a ':'. Ignoring it.", + line, filename); continue; } *s = '\0'; @@ -616,12 +619,14 @@ static int health_readfile(const char *filename, void *data) { value = trim_all(value); if(!key) { - error("Health configuration has invalid line %zu of file '%s'. Keyword is empty. Ignoring it.", line, filename); + netdata_log_error("Health configuration has invalid line %zu of file '%s'. Keyword is empty. Ignoring it.", + line, filename); continue; } if(!value) { - error("Health configuration has invalid line %zu of file '%s'. value is empty. Ignoring it.", line, filename); + netdata_log_error("Health configuration has invalid line %zu of file '%s'. value is empty. Ignoring it.", + line, filename); continue; } @@ -653,7 +658,7 @@ static int health_readfile(const char *filename, void *data) { { char *tmp = strdupz(value); if(rrdvar_fix_name(tmp)) - error("Health configuration renamed alarm '%s' to '%s'", value, tmp); + netdata_log_error("Health configuration renamed alarm '%s' to '%s'", value, tmp); rc->name = string_strdupz(tmp); freez(tmp); @@ -673,6 +678,7 @@ static int health_readfile(const char *filename, void *data) { alert_cfg = callocz(1, sizeof(struct alert_config)); alert_cfg->alarm = string_dup(rc->name); + alert_cfg->source = health_source_file(line, filename); ignore_this = 0; } else { rc = NULL; @@ -702,7 +708,7 @@ static int health_readfile(const char *filename, void *data) { { char *tmp = strdupz(value); if(rrdvar_fix_name(tmp)) - error("Health configuration renamed template '%s' to '%s'", value, tmp); + netdata_log_error("Health configuration renamed template '%s' to '%s'", value, tmp); rt->name = string_strdupz(tmp); freez(tmp); @@ -719,6 +725,7 @@ static int health_readfile(const char *filename, void *data) { alert_cfg = callocz(1, sizeof(struct alert_config)); alert_cfg->template_key = string_dup(rt->name); + alert_cfg->source = health_source_file(line, filename); ignore_this = 0; } else { rt = NULL; @@ -731,10 +738,10 @@ static int health_readfile(const char *filename, void *data) { if(!simple_pattern_matches_string(os_pattern, host->os)) { if(rc) - debug(D_HEALTH, "HEALTH on '%s' ignoring alarm '%s' defined at %zu@%s: host O/S does not match '%s'", rrdhost_hostname(host), rrdcalc_name(rc), line, filename, os_match); + netdata_log_debug(D_HEALTH, "HEALTH on '%s' ignoring alarm '%s' defined at %zu@%s: host O/S does not match '%s'", rrdhost_hostname(host), rrdcalc_name(rc), line, filename, os_match); if(rt) - debug(D_HEALTH, "HEALTH on '%s' ignoring template '%s' defined at %zu@%s: host O/S does not match '%s'", rrdhost_hostname(host), rrdcalctemplate_name(rt), line, filename, os_match); + netdata_log_debug(D_HEALTH, "HEALTH on '%s' ignoring template '%s' defined at %zu@%s: host O/S does not match '%s'", rrdhost_hostname(host), rrdcalctemplate_name(rt), line, filename, os_match); ignore_this = 1; } @@ -748,10 +755,10 @@ static int health_readfile(const char *filename, void *data) { if(!simple_pattern_matches_string(host_pattern, host->hostname)) { if(rc) - debug(D_HEALTH, "HEALTH on '%s' ignoring alarm '%s' defined at %zu@%s: hostname does not match '%s'", rrdhost_hostname(host), rrdcalc_name(rc), line, filename, host_match); + netdata_log_debug(D_HEALTH, "HEALTH on '%s' ignoring alarm '%s' defined at %zu@%s: hostname does not match '%s'", rrdhost_hostname(host), rrdcalc_name(rc), line, filename, host_match); if(rt) - debug(D_HEALTH, "HEALTH on '%s' ignoring template '%s' defined at %zu@%s: hostname does not match '%s'", rrdhost_hostname(host), rrdcalctemplate_name(rt), line, filename, host_match); + netdata_log_debug(D_HEALTH, "HEALTH on '%s' ignoring template '%s' defined at %zu@%s: hostname does not match '%s'", rrdhost_hostname(host), rrdcalctemplate_name(rt), line, filename, host_match); ignore_this = 1; } @@ -763,8 +770,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->on = string_strdupz(value); if(rc->chart) { if(strcmp(rrdcalc_chart_name(rc), value) != 0) - error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rrdcalc_name(rc), key, rrdcalc_chart_name(rc), value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalc_name(rc), key, rrdcalc_chart_name(rc), value, value); string_freez(rc->chart); } @@ -776,8 +783,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->classification = string_strdupz(value); if(rc->classification) { if(strcmp(rrdcalc_classification(rc), value) != 0) - error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rrdcalc_name(rc), key, rrdcalc_classification(rc), value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalc_name(rc), key, rrdcalc_classification(rc), value, value); string_freez(rc->classification); } @@ -789,7 +796,7 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->component = string_strdupz(value); if(rc->component) { if(strcmp(rrdcalc_component(rc), value) != 0) - error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", line, filename, rrdcalc_name(rc), key, rrdcalc_component(rc), value, value); string_freez(rc->component); @@ -802,8 +809,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->type = string_strdupz(value); if(rc->type) { if(strcmp(rrdcalc_type(rc), value) != 0) - error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rrdcalc_name(rc), key, rrdcalc_type(rc), value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalc_name(rc), key, rrdcalc_type(rc), value, value); string_freez(rc->type); } @@ -831,8 +838,8 @@ static int health_readfile(const char *filename, void *data) { else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { alert_cfg->every = string_strdupz(value); if(!config_parse_duration(value, &rc->update_every)) - error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.", - line, filename, rrdcalc_name(rc), key, value); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.", + line, filename, rrdcalc_name(rc), key, value); alert_cfg->p_update_every = rc->update_every; } else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) { @@ -840,8 +847,8 @@ static int health_readfile(const char *filename, void *data) { char *e; rc->green = str2ndd(value, &e); if(e && *e) { - error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.", - line, filename, rrdcalc_name(rc), key, e); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.", + line, filename, rrdcalc_name(rc), key, e); } } else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) { @@ -849,8 +856,8 @@ static int health_readfile(const char *filename, void *data) { char *e; rc->red = str2ndd(value, &e); if(e && *e) { - error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.", - line, filename, rrdcalc_name(rc), key, e); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.", + line, filename, rrdcalc_name(rc), key, e); } } else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) { @@ -859,8 +866,8 @@ static int health_readfile(const char *filename, void *data) { int error = 0; rc->calculation = expression_parse(value, &failed_at, &error); if(!rc->calculation) { - error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", + line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at); } parse_variables_and_store_in_health_rrdvars(value, HEALTH_CONF_MAX_LINE); } @@ -870,8 +877,8 @@ static int health_readfile(const char *filename, void *data) { int error = 0; rc->warning = expression_parse(value, &failed_at, &error); if(!rc->warning) { - error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", + line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at); } parse_variables_and_store_in_health_rrdvars(value, HEALTH_CONF_MAX_LINE); } @@ -881,8 +888,8 @@ static int health_readfile(const char *filename, void *data) { int error = 0; rc->critical = expression_parse(value, &failed_at, &error); if(!rc->critical) { - error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", + line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at); } parse_variables_and_store_in_health_rrdvars(value, HEALTH_CONF_MAX_LINE); } @@ -890,8 +897,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->exec = string_strdupz(value); if(rc->exec) { if(strcmp(rrdcalc_exec(rc), value) != 0) - error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rrdcalc_name(rc), key, rrdcalc_exec(rc), value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalc_name(rc), key, rrdcalc_exec(rc), value, value); string_freez(rc->exec); } @@ -901,8 +908,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->to = string_strdupz(value); if(rc->recipient) { if(strcmp(rrdcalc_recipient(rc), value) != 0) - error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rrdcalc_name(rc), key, rrdcalc_recipient(rc), value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalc_name(rc), key, rrdcalc_recipient(rc), value, value); string_freez(rc->recipient); } @@ -914,8 +921,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->units = string_strdupz(value); if(rc->units) { if(strcmp(rrdcalc_units(rc), value) != 0) - error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rrdcalc_name(rc), key, rrdcalc_units(rc), value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalc_name(rc), key, rrdcalc_units(rc), value, value); string_freez(rc->units); } @@ -927,8 +934,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->info = string_strdupz(value); if(rc->info) { if(strcmp(rrdcalc_info(rc), value) != 0) - error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rrdcalc_name(rc), key, rrdcalc_info(rc), value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalc_name(rc), key, rrdcalc_info(rc), value, value); string_freez(rc->info); string_freez(rc->original_info); @@ -954,8 +961,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->host_labels = string_strdupz(value); if(rc->host_labels) { if(strcmp(rrdcalc_host_labels(rc), value) != 0) - error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'.", - line, filename, rrdcalc_name(rc), key, value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'.", + line, filename, rrdcalc_name(rc), key, value, value); string_freez(rc->host_labels); simple_pattern_free(rc->host_labels_pattern); @@ -989,8 +996,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->chart_labels = string_strdupz(value); if(rc->chart_labels) { if(strcmp(rrdcalc_chart_labels(rc), value) != 0) - error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'.", - line, filename, rrdcalc_name(rc), key, value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'.", + line, filename, rrdcalc_name(rc), key, value, value); string_freez(rc->chart_labels); simple_pattern_free(rc->chart_labels_pattern); @@ -1007,8 +1014,8 @@ static int health_readfile(const char *filename, void *data) { true); } else { - error("Health configuration at line %zu of file '%s' for alarm '%s' has unknown key '%s'.", - line, filename, rrdcalc_name(rc), key); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has unknown key '%s'.", + line, filename, rrdcalc_name(rc), key); } } else if(rt) { @@ -1016,8 +1023,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->on = string_strdupz(value); if(rt->context) { if(strcmp(string2str(rt->context), value) != 0) - error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rrdcalctemplate_name(rt), key, string2str(rt->context), value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalctemplate_name(rt), key, string2str(rt->context), value, value); string_freez(rt->context); } @@ -1029,8 +1036,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->classification = string_strdupz(value); if(rt->classification) { if(strcmp(rrdcalctemplate_classification(rt), value) != 0) - error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_classification(rt), value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_classification(rt), value, value); string_freez(rt->classification); } @@ -1042,8 +1049,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->component = string_strdupz(value); if(rt->component) { if(strcmp(rrdcalctemplate_component(rt), value) != 0) - error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_component(rt), value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_component(rt), value, value); string_freez(rt->component); } @@ -1055,8 +1062,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->type = string_strdupz(value); if(rt->type) { if(strcmp(rrdcalctemplate_type(rt), value) != 0) - error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_type(rt), value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_type(rt), value, value); string_freez(rt->type); } @@ -1122,8 +1129,8 @@ static int health_readfile(const char *filename, void *data) { else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { alert_cfg->every = string_strdupz(value); if(!config_parse_duration(value, &rt->update_every)) - error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' cannot parse duration: '%s'.", - line, filename, rrdcalctemplate_name(rt), key, value); + netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' cannot parse duration: '%s'.", + line, filename, rrdcalctemplate_name(rt), key, value); alert_cfg->p_update_every = rt->update_every; } else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) { @@ -1131,8 +1138,8 @@ static int health_readfile(const char *filename, void *data) { char *e; rt->green = str2ndd(value, &e); if(e && *e) { - error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.", - line, filename, rrdcalctemplate_name(rt), key, e); + netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.", + line, filename, rrdcalctemplate_name(rt), key, e); } } else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) { @@ -1140,8 +1147,8 @@ static int health_readfile(const char *filename, void *data) { char *e; rt->red = str2ndd(value, &e); if(e && *e) { - error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.", - line, filename, rrdcalctemplate_name(rt), key, e); + netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.", + line, filename, rrdcalctemplate_name(rt), key, e); } } else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) { @@ -1150,8 +1157,8 @@ static int health_readfile(const char *filename, void *data) { int error = 0; rt->calculation = expression_parse(value, &failed_at, &error); if(!rt->calculation) { - error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at); + netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", + line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at); } parse_variables_and_store_in_health_rrdvars(value, HEALTH_CONF_MAX_LINE); } @@ -1161,8 +1168,8 @@ static int health_readfile(const char *filename, void *data) { int error = 0; rt->warning = expression_parse(value, &failed_at, &error); if(!rt->warning) { - error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at); + netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", + line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at); } parse_variables_and_store_in_health_rrdvars(value, HEALTH_CONF_MAX_LINE); } @@ -1172,8 +1179,8 @@ static int health_readfile(const char *filename, void *data) { int error = 0; rt->critical = expression_parse(value, &failed_at, &error); if(!rt->critical) { - error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at); + netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", + line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at); } parse_variables_and_store_in_health_rrdvars(value, HEALTH_CONF_MAX_LINE); } @@ -1181,8 +1188,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->exec = string_strdupz(value); if(rt->exec) { if(strcmp(rrdcalctemplate_exec(rt), value) != 0) - error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_exec(rt), value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_exec(rt), value, value); string_freez(rt->exec); } @@ -1192,8 +1199,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->to = string_strdupz(value); if(rt->recipient) { if(strcmp(rrdcalctemplate_recipient(rt), value) != 0) - error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_recipient(rt), value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_recipient(rt), value, value); string_freez(rt->recipient); } @@ -1205,8 +1212,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->units = string_strdupz(value); if(rt->units) { if(strcmp(rrdcalctemplate_units(rt), value) != 0) - error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_units(rt), value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_units(rt), value, value); string_freez(rt->units); } @@ -1218,8 +1225,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->info = string_strdupz(value); if(rt->info) { if(strcmp(rrdcalctemplate_info(rt), value) != 0) - error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_info(rt), value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_info(rt), value, value); string_freez(rt->info); } @@ -1243,8 +1250,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->host_labels = string_strdupz(value); if(rt->host_labels) { if(strcmp(rrdcalctemplate_host_labels(rt), value) != 0) - error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_host_labels(rt), value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_host_labels(rt), value, value); string_freez(rt->host_labels); simple_pattern_free(rt->host_labels_pattern); @@ -1263,8 +1270,8 @@ static int health_readfile(const char *filename, void *data) { alert_cfg->chart_labels = string_strdupz(value); if(rt->chart_labels) { if(strcmp(rrdcalctemplate_chart_labels(rt), value) != 0) - error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_chart_labels(rt), value, value); + netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_chart_labels(rt), value, value); string_freez(rt->chart_labels); simple_pattern_free(rt->chart_labels_pattern); @@ -1281,13 +1288,13 @@ static int health_readfile(const char *filename, void *data) { SIMPLE_PATTERN_EXACT, true); } else { - error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.", - line, filename, rrdcalctemplate_name(rt), key); + netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.", + line, filename, rrdcalctemplate_name(rt), key); } } else { - error("Health configuration at line %zu of file '%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.", - line, filename, key); + netdata_log_error("Health configuration at line %zu of file '%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.", + line, filename, key); } } @@ -1321,7 +1328,7 @@ void sql_refresh_hashes(void) void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath) { if(unlikely((!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) || !service_running(SERVICE_HEALTH)) { - debug(D_HEALTH, "CONFIG health is not enabled for host '%s'", rrdhost_hostname(host)); + netdata_log_debug(D_HEALTH, "CONFIG health is not enabled for host '%s'", rrdhost_hostname(host)); return; } @@ -1329,7 +1336,7 @@ void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path CONFIG_BOOLEAN_YES); if (!stock_enabled) { - log_health("[%s]: Netdata will not load stock alarms.", rrdhost_hostname(host)); + netdata_log_health("[%s]: Netdata will not load stock alarms.", rrdhost_hostname(host)); stock_path = user_path; } @@ -1337,6 +1344,6 @@ void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path health_rrdvars = health_rrdvariables_create(); recursive_config_double_dir_load(user_path, stock_path, subpath, health_readfile, (void *) host, 0); - log_health("[%s]: Read health configuration.", rrdhost_hostname(host)); + netdata_log_health("[%s]: Read health configuration.", rrdhost_hostname(host)); sql_store_hashes = 0; } diff --git a/health/health_json.c b/health/health_json.c index 4f81998f..1da0f597 100644 --- a/health/health_json.c +++ b/health/health_json.c @@ -167,10 +167,6 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC buffer_strcat(wb, "\t\t}"); } -//void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) { -// -//} - void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* contexts, RRDCALC_STATUS status) { RRDCALC *rc; int numberOfAlarms = 0; diff --git a/health/health_log.c b/health/health_log.c index b62e0ace..4cfbee60 100644 --- a/health/health_log.c +++ b/health/health_log.c @@ -35,9 +35,9 @@ inline ALARM_ENTRY* health_create_alarm_entry( STRING *units, STRING *info, int delay, - uint32_t flags + HEALTH_ENTRY_FLAGS flags ) { - debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id); + netdata_log_debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id); ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY)); ae->name = string_dup(name); @@ -47,6 +47,7 @@ inline ALARM_ENTRY* health_create_alarm_entry( uuid_copy(ae->config_hash_id, *((uuid_t *) config_hash_id)); uuid_generate_random(ae->transition_id); + ae->global_id = now_realtime_usec(); ae->family = string_dup(family); ae->classification = string_dup(class); @@ -88,19 +89,19 @@ inline void health_alarm_log_add_entry( RRDHOST *host, ALARM_ENTRY *ae ) { - debug(D_HEALTH, "Health adding alarm log entry with id: %u", ae->unique_id); + netdata_log_debug(D_HEALTH, "Health adding alarm log entry with id: %u", ae->unique_id); __atomic_add_fetch(&host->health_transitions, 1, __ATOMIC_RELAXED); // link it - netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_write_lock(&host->health_log.spinlock); ae->next = host->health_log.alarms; host->health_log.alarms = ae; host->health_log.count++; - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_write_unlock(&host->health_log.spinlock); // match previous alarms - netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_read_lock(&host->health_log.spinlock); ALARM_ENTRY *t; for(t = host->health_log.alarms ; t ; t = t->next) { if(t != ae && t->alarm_id == ae->alarm_id) { @@ -120,7 +121,7 @@ inline void health_alarm_log_add_entry( break; } } - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_read_unlock(&host->health_log.spinlock); health_alarm_log_save(host, ae); } @@ -144,7 +145,7 @@ inline void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae) { } inline void health_alarm_log_free(RRDHOST *host) { - netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_write_lock(&host->health_log.spinlock); ALARM_ENTRY *ae; while((ae = host->health_log.alarms)) { @@ -152,5 +153,5 @@ inline void health_alarm_log_free(RRDHOST *host) { health_alarm_log_free_one_nochecks_nounlink(ae); } - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); + rw_spinlock_write_unlock(&host->health_log.spinlock); } diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in index 51c00021..3cff33db 100755 --- a/health/notifications/alarm-notify.sh.in +++ b/health/notifications/alarm-notify.sh.in @@ -3,7 +3,7 @@ # netdata # real-time performance and health monitoring, done right! -# (C) 2017 Costa Tsaousis <costa@tsaousis.gr> +# (C) 2023 Netdata Inc. # SPDX-License-Identifier: GPL-3.0-or-later # # Script to send alarm notifications for netdata @@ -246,7 +246,8 @@ else total_crit_alarms="${26}" # List of alarms in critical state classification="${27}" # The class field from .conf files edit_command_line="${28}" # The command to edit the alarm, with the line number - child_machine_guid="${29}" # If populated, the notification is sent for a child + child_machine_guid="${29}" # the machine_guid of the child + transition_id="${30}" # the transition_id of the alert fi # ----------------------------------------------------------------------------- @@ -768,6 +769,15 @@ if [ "${SEND_AWSSNS}" = "YES" ] && [ -z "${aws}" ]; then fi fi +# if we need nc, check for the nc command +if [ "${SEND_IRC}" = "YES" ] && [ -z "${nc}" ]; then + nc="$(command -v nc 2>/dev/null)" + if [ -z "${nc}" ]; then + debug "Cannot find nc command in the system path. Disabling IRC notifications." + SEND_IRC="NO" + fi +fi + if [ ${dump_methods} ]; then for name in "${!SEND_@}"; do if [ "${!name}" = "YES" ]; then @@ -1913,7 +1923,7 @@ send_irc() { SNDMESSAGE="${MESSAGE//$'\n'/", "}" for CHANNEL in ${CHANNELS}; do error=0 - send_alarm=$(echo -e "USER ${NICKNAME} guest ${REALNAME} ${SERVERNAME}\\nNICK ${NICKNAME}\\nJOIN ${CHANNEL}\\nPRIVMSG ${CHANNEL} :${SNDMESSAGE}\\nQUIT\\n" \ | nc "${NETWORK}" "${PORT}") + send_alarm=$(echo -e "USER ${NICKNAME} guest ${REALNAME} ${SERVERNAME}\\nNICK ${NICKNAME}\\nJOIN ${CHANNEL}\\nPRIVMSG ${CHANNEL} :${SNDMESSAGE}\\nQUIT\\n" \ | ${nc} "${NETWORK}" "${PORT}") reply_codes=$(echo "${send_alarm}" | cut -d ' ' -f 2 | grep -o '[0-9]*') for code in ${reply_codes}; do if [ "${code}" -ge 400 ] && [ "${code}" -le 599 ]; then @@ -2479,31 +2489,17 @@ urlencode "${value_string}" >/dev/null url_value_string="${REPLY}" redirect_params="host=${url_host}&chart=${url_chart}&family=${url_family}&alarm=${url_name}&alarm_unique_id=${unique_id}&alarm_id=${alarm_id}&alarm_event_id=${event_id}&alarm_when=${when}&alarm_status=${status}&alarm_chart=${chart}&alarm_value=${url_value_string}" -GOTOCLOUD=0 - -if [ "${NETDATA_REGISTRY_URL}" == "https://registry.my-netdata.io" ]; then - if [ -z "${NETDATA_REGISTRY_UNIQUE_ID}" ]; then - if [ -f "@registrydir_POST@/netdata.public.unique.id" ]; then - NETDATA_REGISTRY_UNIQUE_ID="$(cat "@registrydir_POST@/netdata.public.unique.id")" - fi - fi - if [ -n "${NETDATA_REGISTRY_UNIQUE_ID}" ]; then - GOTOCLOUD=1 - fi -fi -if [ ${GOTOCLOUD} -eq 0 ]; then - goto_url="${NETDATA_REGISTRY_URL}/goto-host-from-alarm.html?${redirect_params}" -else - # Temporarily disable alarm redirection, as the cloud endpoint no longer exists. This functionality will be restored after discussion on #9487. For now, just lead to netdata.cloud - # Re-allow alarm redirection, for alarms 2.0, new template - if [ -z "${child_machine_guid}" ]; then - goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentId=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}" +if [ -z "${NETDATA_REGISTRY_UNIQUE_ID}" ]; then + if [ -f "@registrydir_POST@/netdata.public.unique.id" ]; then + NETDATA_REGISTRY_UNIQUE_ID="$(cat "@registrydir_POST@/netdata.public.unique.id")" else - goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentId=${NETDATA_REGISTRY_UNIQUE_ID}&childId=${child_machine_guid}&${redirect_params}" + error "failed to identify this agent via its NETDATA_REGISTRY_UNIQUE_ID." fi fi +goto_url="${NETDATA_REGISTRY_URL}/registry-alert-redirect.html?agent_machine_guid=${NETDATA_REGISTRY_UNIQUE_ID}&host_machine_guid=${child_machine_guid}&transition_id=${transition_id}&${redirect_params}" + # the severity of the alarm severity="${status}" |