summaryrefslogtreecommitdiffstats
path: root/health/health.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2023-07-20 04:49:55 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2023-07-20 04:49:55 +0000
commitab1bb5b7f1c3c3a7b240ab7fc8661459ecd7decb (patch)
tree7a900833aad3ccc685712c6c2a7d87576d54f427 /health/health.c
parentAdding upstream version 1.40.1. (diff)
downloadnetdata-ab1bb5b7f1c3c3a7b240ab7fc8661459ecd7decb.tar.xz
netdata-ab1bb5b7f1c3c3a7b240ab7fc8661459ecd7decb.zip
Adding upstream version 1.41.0.upstream/1.41.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health/health.c')
-rw-r--r--health/health.c269
1 files changed, 165 insertions, 104 deletions
diff --git a/health/health.c b/health/health.c
index e04debb93..eeed3a674 100644
--- a/health/health.c
+++ b/health/health.c
@@ -22,6 +22,35 @@ char *silencers_filename;
SIMPLE_PATTERN *conf_enabled_alarms = NULL;
DICTIONARY *health_rrdvars;
+void health_entry_flags_to_json_array(BUFFER *wb, const char *key, HEALTH_ENTRY_FLAGS flags) {
+ buffer_json_member_add_array(wb, key);
+
+ if(flags & HEALTH_ENTRY_FLAG_PROCESSED)
+ buffer_json_add_array_item_string(wb, "PROCESSED");
+ if(flags & HEALTH_ENTRY_FLAG_UPDATED)
+ buffer_json_add_array_item_string(wb, "UPDATED");
+ if(flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
+ buffer_json_add_array_item_string(wb, "EXEC_RUN");
+ if(flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)
+ buffer_json_add_array_item_string(wb, "EXEC_FAILED");
+ if(flags & HEALTH_ENTRY_FLAG_SILENCED)
+ buffer_json_add_array_item_string(wb, "SILENCED");
+ if(flags & HEALTH_ENTRY_RUN_ONCE)
+ buffer_json_add_array_item_string(wb, "RUN_ONCE");
+ if(flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS)
+ buffer_json_add_array_item_string(wb, "EXEC_IN_PROGRESS");
+ if(flags & HEALTH_ENTRY_FLAG_IS_REPEATING)
+ buffer_json_add_array_item_string(wb, "RECURRING");
+ if(flags & HEALTH_ENTRY_FLAG_SAVED)
+ buffer_json_add_array_item_string(wb, "SAVED");
+ if(flags & HEALTH_ENTRY_FLAG_ACLK_QUEUED)
+ buffer_json_add_array_item_string(wb, "ACLK_QUEUED");
+ if(flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)
+ buffer_json_add_array_item_string(wb, "NO_CLEAR_NOTIFICATION");
+
+ buffer_json_array_close(wb);
+}
+
static bool prepare_command(BUFFER *wb,
const char *exec,
const char *recipient,
@@ -52,8 +81,9 @@ static bool prepare_command(BUFFER *wb,
const char *crit_alarms,
const char *classification,
const char *edit_command,
- const char *machine_guid)
-{
+ const char *machine_guid,
+ uuid_t *transition_id
+) {
char buf[8192];
size_t n = 8192 - 1;
@@ -159,6 +189,12 @@ static bool prepare_command(BUFFER *wb,
return false;
buffer_sprintf(wb, " '%s'", buf);
+ char tr_id[UUID_STR_LEN];
+ uuid_unparse_lower(*transition_id, tr_id);
+ if (!sanitize_command_argument_string(buf, tr_id, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
return true;
}
@@ -257,22 +293,22 @@ static void health_silencers_init(void) {
if (copied == (length* sizeof(char))) {
str[length] = 0x00;
json_parse(str, NULL, health_silencers_json_read_callback);
- info("Parsed health silencers file %s", silencers_filename);
+ netdata_log_info("Parsed health silencers file %s", silencers_filename);
} else {
- error("Cannot read the data from health silencers file %s", silencers_filename);
+ netdata_log_error("Cannot read the data from health silencers file %s", silencers_filename);
}
freez(str);
}
} else {
- error(
- "Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.",
- silencers_filename,
- (int64_t)length,
- HEALTH_SILENCERS_MAX_FILE_LEN);
+ netdata_log_error("Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.",
+ silencers_filename,
+ (int64_t)length,
+ HEALTH_SILENCERS_MAX_FILE_LEN);
}
fclose(fd);
} else {
- info("Cannot open the file %s, so Netdata will work with the default health configuration.",silencers_filename);
+ netdata_log_info("Cannot open the file %s, so Netdata will work with the default health configuration.",
+ silencers_filename);
}
}
@@ -282,10 +318,10 @@ static void health_silencers_init(void) {
* Initialize the health thread.
*/
void health_init(void) {
- debug(D_HEALTH, "Health configuration initializing");
+ netdata_log_debug(D_HEALTH, "Health configuration initializing");
if(!(default_health_enabled = (unsigned int)config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", default_health_enabled))) {
- debug(D_HEALTH, "Health is disabled.");
+ netdata_log_debug(D_HEALTH, "Health is disabled.");
return;
}
@@ -306,7 +342,7 @@ static void health_reload_host(RRDHOST *host) {
if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))
return;
- log_health("[%s]: Reloading health.", rrdhost_hostname(host));
+ netdata_log_health("[%s]: Reloading health.", rrdhost_hostname(host));
char *user_path = health_user_config_dir();
char *stock_path = health_stock_config_dir();
@@ -316,13 +352,13 @@ static void health_reload_host(RRDHOST *host) {
rrdcalctemplate_delete_all(host);
// invalidate all previous entries in the alarm log
- netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_read_lock(&host->health_log.spinlock);
ALARM_ENTRY *t;
for(t = host->health_log.alarms ; t ; t = t->next) {
if(t->new_status != RRDCALC_STATUS_REMOVED)
t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
}
- netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_read_unlock(&host->health_log.spinlock);
// reset all thresholds to all charts
RRDSET *st;
@@ -349,7 +385,7 @@ static void health_reload_host(RRDHOST *host) {
rrdset_foreach_done(st);
#ifdef ENABLE_ACLK
- if (netdata_cloud_setting) {
+ if (netdata_cloud_enabled) {
struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
if (likely(wc)) {
wc->alert_queue_removed = SEND_REMOVED_AFTER_HEALTH_LOOPS;
@@ -397,14 +433,15 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
// do not send notifications for internal statuses
- debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ netdata_log_debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
// do not send notifications for disabled statuses
- debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
- log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ netdata_log_debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ netdata_log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+
// mark it as run, so that we will send the same alarm if it happens again
goto done;
}
@@ -418,11 +455,11 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
if (likely(ret == 1)) {
// we have executed this alarm notification in the past
- if(last_executed_status == ae->new_status) {
+ if(last_executed_status == ae->new_status && !(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) {
// don't send the notification for the same status again
- debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae)
+ netdata_log_debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae)
, rrdcalc_status2string(ae->new_status));
- log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae)
+ netdata_log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae)
, rrdcalc_status2string(ae->new_status));
goto done;
}
@@ -432,7 +469,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// so, don't send CLEAR notifications
if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
- debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
+ netdata_log_debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
, ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
@@ -442,11 +479,11 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// Check if alarm notifications are silenced
if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
- log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ netdata_log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
- log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ netdata_log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health.health_default_exec);
const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health.health_default_recipient);
@@ -533,7 +570,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
ae->old_value,
ae->source?ae_source(ae):"UNKNOWN",
(uint32_t)ae->duration,
- (uint32_t)ae->non_clear_duration,
+ (ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING && ae->new_status >= RRDCALC_STATUS_WARNING) ? (uint32_t)ae->duration : (uint32_t)ae->non_clear_duration,
ae_units(ae),
ae_info(ae),
ae_new_value_string(ae),
@@ -546,20 +583,21 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
buffer_tostring(crit_alarms),
ae->classification?ae_classification(ae):"Unknown",
edit_command,
- host != localhost ? host->machine_guid:"");
+ host->machine_guid,
+ &ae->transition_id);
const char *command_to_run = buffer_tostring(wb);
if (ok) {
ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */
- debug(D_HEALTH, "executing command '%s'", command_to_run);
+ netdata_log_debug(D_HEALTH, "executing command '%s'", command_to_run);
ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
enqueue_alarm_notify_in_progress(ae);
health_alarm_log_save(host, ae);
} else {
- error("Failed to format command arguments");
+ netdata_log_error("Failed to format command arguments");
}
buffer_free(wb);
@@ -578,7 +616,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
return;
spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp);
- debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
+ netdata_log_debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
if(ae->exec_code != 0)
@@ -588,7 +626,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
}
static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
- debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
+ netdata_log_debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
ae->chart?ae_chart_name(ae):"NOCHART", ae_name(ae),
ae->new_value,
rrdcalc_status2string(ae->old_status),
@@ -602,31 +640,29 @@ static inline void health_alarm_log_process(RRDHOST *host) {
uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
time_t now = now_realtime_sec();
- netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_read_lock(&host->health_log.spinlock);
ALARM_ENTRY *ae;
for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
- if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
- if(unlikely(
+ if(unlikely(
!(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
!(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
- )) {
- if(unlikely(ae->unique_id < first_waiting))
- first_waiting = ae->unique_id;
+ )) {
+ if(unlikely(ae->unique_id < first_waiting))
+ first_waiting = ae->unique_id;
- if(likely(now >= ae->delay_up_to_timestamp))
- health_process_notifications(host, ae);
- }
+ if(likely(now >= ae->delay_up_to_timestamp))
+ health_process_notifications(host, ae);
}
}
- netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_read_unlock(&host->health_log.spinlock);
// remember this for the next iteration
host->health_last_processed_id = first_waiting;
//delete those that are updated, no in progress execution, and is not repeating
- netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_write_lock(&host->health_log.spinlock);
ALARM_ENTRY *prev = NULL, *next = NULL;
for(ae = host->health_log.alarms; ae ; ae = next) {
@@ -639,7 +675,7 @@ static inline void health_alarm_log_process(RRDHOST *host) {
||
((ae->new_status == RRDCALC_STATUS_REMOVED) &&
(ae->flags & HEALTH_ENTRY_FLAG_SAVED) &&
- (ae->when + 3600 < now_realtime_sec())))
+ (ae->when + 86400 < now_realtime_sec())))
{
if(host->health_log.alarms == ae) {
@@ -658,12 +694,12 @@ static inline void health_alarm_log_process(RRDHOST *host) {
prev = ae;
}
- netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_write_unlock(&host->health_log.spinlock);
}
static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
if(unlikely(!rc->rrdset)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
+ netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
@@ -674,27 +710,27 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
*next_run = rc->next_update;
}
- debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now));
+ netdata_log_debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now));
return 0;
}
if(unlikely(!rc->update_every)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc));
+ netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc));
+ netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc));
+ netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
+ netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
@@ -703,7 +739,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
time_t last = rrdset_last_entry_s(rc->rrdset);
if(unlikely(now + update_every < first /* || now - update_every > last */)) {
- debug(D_HEALTH
+ netdata_log_debug(D_HEALTH
, "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
, rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) now, (unsigned long) first
, (unsigned long) last);
@@ -714,7 +750,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
time_t needed = now + rc->before + rc->after;
if(needed + update_every < first || needed - update_every > last) {
- debug(D_HEALTH
+ netdata_log_debug(D_HEALTH
, "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
, rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) needed, (unsigned long) first
, (unsigned long) last);
@@ -747,10 +783,10 @@ static void health_main_cleanup(void *ptr) {
struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
- info("cleaning up...");
+ netdata_log_info("cleaning up...");
static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
- log_health("Health thread ended.");
+ netdata_log_health("Health thread ended.");
}
static void initialize_health(RRDHOST *host)
@@ -762,7 +798,7 @@ static void initialize_health(RRDHOST *host)
rrdhost_flag_set(host, RRDHOST_FLAG_INITIALIZED_HEALTH);
- log_health("[%s]: Initializing health.", rrdhost_hostname(host));
+ netdata_log_health("[%s]: Initializing health.", rrdhost_hostname(host));
host->health.health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
host->health.health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
@@ -775,16 +811,32 @@ static void initialize_health(RRDHOST *host)
long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max);
if(n < 10) {
- error("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max);
+ netdata_log_health("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max);
config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max);
}
else
host->health_log.max = (unsigned int)n;
+ uint32_t m = config_get_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_DEFAULT_HISTORY);
+ if (m < HEALTH_LOG_MINIMUM_HISTORY) {
+ netdata_log_health("Host '%s': health configuration has invalid health log history %u. Using minimum %d", rrdhost_hostname(host), m, HEALTH_LOG_MINIMUM_HISTORY);
+ config_set_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_MINIMUM_HISTORY);
+ m = HEALTH_LOG_MINIMUM_HISTORY;
+ }
+
+ //default health log history is 5 days and not less than a day
+ if (host->health_log.health_log_history) {
+ if (host->health_log.health_log_history < HEALTH_LOG_MINIMUM_HISTORY)
+ host->health_log.health_log_history = HEALTH_LOG_MINIMUM_HISTORY;
+ } else
+ host->health_log.health_log_history = m;
+
+ netdata_log_health("[%s]: Health log history is set to %u seconds (%u days)", rrdhost_hostname(host), host->health_log.health_log_history, host->health_log.health_log_history / 86400);
+
conf_enabled_alarms = simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), NULL,
SIMPLE_PATTERN_EXACT, true);
- netdata_rwlock_init(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_init(&host->health_log.spinlock);
char filename[FILENAME_MAX + 1];
@@ -794,7 +846,6 @@ static void initialize_health(RRDHOST *host)
// TODO: This needs to go to the metadata thread
// Health should wait before accessing the table (needs to be created by the metadata thread)
- sql_create_health_log_table(host);
sql_health_alarm_log_load(host);
// ------------------------------------------------------------------------
@@ -821,20 +872,20 @@ static void health_sleep(time_t next_run, unsigned int loop __maybe_unused) {
time_t now = now_realtime_sec();
if(now < next_run) {
worker_is_idle();
- debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
+ netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
while (now < next_run && service_running(SERVICE_HEALTH)) {
sleep_usec(USEC_PER_SEC);
now = now_realtime_sec();
}
}
else {
- debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
+ netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
}
}
static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *silencers) {
SILENCER *s;
- debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s",
+ netdata_log_debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s",
rrdcalc_name(rc), (rc->rrdset)?rrdset_context(rc->rrdset):"", rrdcalc_chart_name(rc), host, (rc->rrdset)?rrdset_family(rc->rrdset):"");
for (s = silencers->silencers; s!=NULL; s=s->next){
@@ -845,11 +896,11 @@ static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *sil
(!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches_string(s->charts_pattern, rc->chart))) &&
(!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches_string(s->families_pattern, rc->rrdset->family)))
) {
- debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families);
+ netdata_log_debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families);
if (unlikely(silencers->stype == STYPE_NONE)) {
- debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc));
+ netdata_log_debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc));
} else {
- debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
+ netdata_log_debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
, (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
, rrdcalc_name(rc)
, (rc->rrdset)?rrdset_context(rc->rrdset):""
@@ -888,7 +939,7 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
}
if (rrdcalc_flags_old != rc->run_flags) {
- info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
+ netdata_log_info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
rrdhost_hostname(host),
rrdcalc_name(rc),
(rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false",
@@ -905,7 +956,7 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
static void sql_health_postpone_queue_removed(RRDHOST *host __maybe_unused) {
#ifdef ENABLE_ACLK
- if (netdata_cloud_setting) {
+ if (netdata_cloud_enabled) {
struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
if (unlikely(!wc)) {
return;
@@ -1001,7 +1052,7 @@ void *health_main(void *ptr) {
while(service_running(SERVICE_HEALTH)) {
loop++;
- debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
+ netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
time_t now = now_realtime_sec();
int runnable = 0, apply_hibernation_delay = 0;
@@ -1012,7 +1063,7 @@ void *health_main(void *ptr) {
if (unlikely(check_if_resumed_from_suspension())) {
apply_hibernation_delay = 1;
- log_health(
+ netdata_log_health(
"Postponing alarm checks for %"PRId64" seconds, "
"because it seems that the system was just resumed from suspension.",
(int64_t)hibernation_delay);
@@ -1021,7 +1072,7 @@ void *health_main(void *ptr) {
if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
static int logged=0;
if (!logged) {
- log_health("Skipping health checks, because all alarms are disabled via a %s command.",
+ netdata_log_health("Skipping health checks, because all alarms are disabled via a %s command.",
HEALTH_CMDAPI_CMD_DISABLEALL);
logged = 1;
}
@@ -1044,7 +1095,7 @@ void *health_main(void *ptr) {
rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
if (unlikely(apply_hibernation_delay)) {
- log_health(
+ netdata_log_health(
"[%s]: Postponing health checks for %"PRId64" seconds.",
rrdhost_hostname(host),
(int64_t)hibernation_delay);
@@ -1057,20 +1108,20 @@ void *health_main(void *ptr) {
continue;
}
- log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host));
+ netdata_log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host));
host->health.health_delay_up_to = 0;
}
// wait until cleanup of obsolete charts on children is complete
if (host != localhost) {
if (unlikely(host->trigger_chart_obsoletion_check == 1)) {
- log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host));
+ netdata_log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host));
continue;
}
}
if (!health_running_logged) {
- log_health("[%s]: Health is running.", rrdhost_hostname(host));
+ netdata_log_health("[%s]: Health is running.", rrdhost_hostname(host));
health_running_logged = true;
}
@@ -1127,11 +1178,13 @@ void *health_main(void *ptr) {
rc->old_status = rc->status;
rc->status = RRDCALC_STATUS_REMOVED;
rc->last_status_change = now;
+ rc->last_status_change_value = rc->value;
rc->last_updated = now;
rc->value = NAN;
+ rc->ae = ae;
#ifdef ENABLE_ACLK
- if (netdata_cloud_setting)
+ if (netdata_cloud_enabled)
sql_queue_alarm_to_aclk(host, ae, 1);
#endif
}
@@ -1170,7 +1223,7 @@ void *health_main(void *ptr) {
rc->value = NAN;
rc->run_flags |= RRDCALC_FLAG_DB_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
+ netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), ret
);
} else
@@ -1181,14 +1234,14 @@ void *health_main(void *ptr) {
rc->value = NAN;
rc->run_flags |= RRDCALC_FLAG_DB_NAN;
- debug(D_HEALTH,
+ netdata_log_debug(D_HEALTH,
"Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc)
);
} else
rc->run_flags &= ~RRDCALC_FLAG_DB_NAN;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT,
+ netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT,
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->value
);
}
@@ -1204,14 +1257,14 @@ void *health_main(void *ptr) {
rc->value = NAN;
rc->run_flags |= RRDCALC_FLAG_CALC_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
+ netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
);
} else {
rc->run_flags &= ~RRDCALC_FLAG_CALC_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
+ netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
NETDATA_DOUBLE_FORMAT
": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
rc->calculation->parsed_as, rc->calculation->result,
@@ -1248,14 +1301,14 @@ void *health_main(void *ptr) {
// calculation failed
rc->run_flags |= RRDCALC_FLAG_WARN_ERROR;
- debug(D_HEALTH,
+ netdata_log_debug(D_HEALTH,
"Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
buffer_tostring(rc->warning->error_msg)
);
} else {
rc->run_flags &= ~RRDCALC_FLAG_WARN_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
+ netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
NETDATA_DOUBLE_FORMAT
": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
rrdcalc_name(rc), rc->warning->result, buffer_tostring(rc->warning->error_msg), rrdcalc_source(rc)
@@ -1274,14 +1327,14 @@ void *health_main(void *ptr) {
// calculation failed
rc->run_flags |= RRDCALC_FLAG_CRIT_ERROR;
- debug(D_HEALTH,
+ netdata_log_debug(D_HEALTH,
"Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
buffer_tostring(rc->critical->error_msg)
);
} else {
rc->run_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
+ netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
NETDATA_DOUBLE_FORMAT
": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
rrdcalc_name(rc), rc->critical->result, buffer_tostring(rc->critical->error_msg),
@@ -1297,36 +1350,37 @@ void *health_main(void *ptr) {
RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED;
switch (warning_status) {
- case RRDCALC_STATUS_CLEAR:
- status = RRDCALC_STATUS_CLEAR;
- break;
+ case RRDCALC_STATUS_CLEAR:
+ status = RRDCALC_STATUS_CLEAR;
+ break;
- case RRDCALC_STATUS_RAISED:
- status = RRDCALC_STATUS_WARNING;
- break;
+ case RRDCALC_STATUS_RAISED:
+ status = RRDCALC_STATUS_WARNING;
+ break;
- default:
- break;
+ default:
+ break;
}
switch (critical_status) {
- case RRDCALC_STATUS_CLEAR:
- if (status == RRDCALC_STATUS_UNDEFINED)
- status = RRDCALC_STATUS_CLEAR;
- break;
+ case RRDCALC_STATUS_CLEAR:
+ if (status == RRDCALC_STATUS_UNDEFINED)
+ status = RRDCALC_STATUS_CLEAR;
+ break;
- case RRDCALC_STATUS_RAISED:
- status = RRDCALC_STATUS_CRITICAL;
- break;
+ case RRDCALC_STATUS_RAISED:
+ status = RRDCALC_STATUS_CRITICAL;
+ break;
- default:
- break;
+ default:
+ break;
}
// --------------------------------------------------------
// check if the new status and the old differ
if (status != rc->status) {
+
worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
int delay = 0;
@@ -1392,11 +1446,19 @@ void *health_main(void *ptr) {
health_alarm_log_add_entry(host, ae);
- log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status));
+ netdata_log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status));
+ rc->last_status_change_value = rc->value;
rc->last_status_change = now;
rc->old_status = rc->status;
rc->status = status;
+ rc->ae = ae;
+
+ if(unlikely(rrdcalc_isrepeating(rc))) {
+ rc->last_repeat = now;
+ if (rc->status == RRDCALC_STATUS_CLEAR)
+ rc->run_flags |= RRDCALC_FLAG_RUN_ONCE;
+ }
}
rc->last_updated = now;
@@ -1437,7 +1499,6 @@ void *health_main(void *ptr) {
worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
rc->last_repeat = now;
if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
-
ALARM_ENTRY *ae = health_create_alarm_entry(
host,
rc->id,
@@ -1475,7 +1536,7 @@ void *health_main(void *ptr) {
}
rc->run_flags |= RRDCALC_FLAG_RUN_ONCE;
health_process_notifications(host, ae);
- debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
+ netdata_log_debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
health_alarm_wait_for_execution(ae);
health_alarm_log_free_one_nochecks_nounlink(ae);
}
@@ -1503,7 +1564,7 @@ void *health_main(void *ptr) {
break;
}
#ifdef ENABLE_ACLK
- if (netdata_cloud_setting) {
+ if (netdata_cloud_enabled) {
struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
if (unlikely(!wc)) {
continue;