summaryrefslogtreecommitdiffstats
path: root/health
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--health/Makefile.am1
-rw-r--r--health/health.c269
-rw-r--r--health/health.d/apcupsd.conf4
-rw-r--r--health/health.d/bcache.conf3
-rw-r--r--health/health.d/beanstalkd.conf3
-rw-r--r--health/health.d/bind_rndc.conf1
-rw-r--r--health/health.d/boinc.conf4
-rw-r--r--health/health.d/cgroups.conf3
-rw-r--r--health/health.d/consul.conf2
-rw-r--r--health/health.d/cpu.conf2
-rw-r--r--health/health.d/dnsmasq_dhcp.conf1
-rw-r--r--health/health.d/docker.conf2
-rw-r--r--health/health.d/elasticsearch.conf2
-rw-r--r--health/health.d/file_descriptors.conf31
-rw-r--r--health/health.d/gearman.conf1
-rw-r--r--health/health.d/geth.conf1
-rw-r--r--health/health.d/ioping.conf4
-rw-r--r--health/health.d/ipc.conf2
-rw-r--r--health/health.d/ipmi.conf10
-rw-r--r--health/health.d/linux_power_supply.conf1
-rw-r--r--health/health.d/nut.conf4
-rw-r--r--health/health.d/pihole.conf3
-rw-r--r--health/health.d/ram.conf2
-rw-r--r--health/health.d/scaleio.conf4
-rw-r--r--health/health.d/vcsa.conf3
-rw-r--r--health/health.d/windows.conf40
-rw-r--r--health/health.h38
-rw-r--r--health/health_config.c231
-rw-r--r--health/health_json.c4
-rw-r--r--health/health_log.c19
-rwxr-xr-xhealth/notifications/alarm-notify.sh.in42
31 files changed, 400 insertions, 337 deletions
diff --git a/health/Makefile.am b/health/Makefile.am
index 0ef55c75e..20e000860 100644
--- a/health/Makefile.am
+++ b/health/Makefile.am
@@ -44,6 +44,7 @@ dist_healthconfig_DATA = \
health.d/elasticsearch.conf \
health.d/entropy.conf \
health.d/exporting.conf \
+ health.d/file_descriptors.conf \
health.d/geth.conf \
health.d/ioping.conf \
health.d/gearman.conf \
diff --git a/health/health.c b/health/health.c
index e04debb93..eeed3a674 100644
--- a/health/health.c
+++ b/health/health.c
@@ -22,6 +22,35 @@ char *silencers_filename;
SIMPLE_PATTERN *conf_enabled_alarms = NULL;
DICTIONARY *health_rrdvars;
+void health_entry_flags_to_json_array(BUFFER *wb, const char *key, HEALTH_ENTRY_FLAGS flags) {
+ buffer_json_member_add_array(wb, key);
+
+ if(flags & HEALTH_ENTRY_FLAG_PROCESSED)
+ buffer_json_add_array_item_string(wb, "PROCESSED");
+ if(flags & HEALTH_ENTRY_FLAG_UPDATED)
+ buffer_json_add_array_item_string(wb, "UPDATED");
+ if(flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
+ buffer_json_add_array_item_string(wb, "EXEC_RUN");
+ if(flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)
+ buffer_json_add_array_item_string(wb, "EXEC_FAILED");
+ if(flags & HEALTH_ENTRY_FLAG_SILENCED)
+ buffer_json_add_array_item_string(wb, "SILENCED");
+ if(flags & HEALTH_ENTRY_RUN_ONCE)
+ buffer_json_add_array_item_string(wb, "RUN_ONCE");
+ if(flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS)
+ buffer_json_add_array_item_string(wb, "EXEC_IN_PROGRESS");
+ if(flags & HEALTH_ENTRY_FLAG_IS_REPEATING)
+ buffer_json_add_array_item_string(wb, "RECURRING");
+ if(flags & HEALTH_ENTRY_FLAG_SAVED)
+ buffer_json_add_array_item_string(wb, "SAVED");
+ if(flags & HEALTH_ENTRY_FLAG_ACLK_QUEUED)
+ buffer_json_add_array_item_string(wb, "ACLK_QUEUED");
+ if(flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)
+ buffer_json_add_array_item_string(wb, "NO_CLEAR_NOTIFICATION");
+
+ buffer_json_array_close(wb);
+}
+
static bool prepare_command(BUFFER *wb,
const char *exec,
const char *recipient,
@@ -52,8 +81,9 @@ static bool prepare_command(BUFFER *wb,
const char *crit_alarms,
const char *classification,
const char *edit_command,
- const char *machine_guid)
-{
+ const char *machine_guid,
+ uuid_t *transition_id
+) {
char buf[8192];
size_t n = 8192 - 1;
@@ -159,6 +189,12 @@ static bool prepare_command(BUFFER *wb,
return false;
buffer_sprintf(wb, " '%s'", buf);
+ char tr_id[UUID_STR_LEN];
+ uuid_unparse_lower(*transition_id, tr_id);
+ if (!sanitize_command_argument_string(buf, tr_id, n))
+ return false;
+ buffer_sprintf(wb, " '%s'", buf);
+
return true;
}
@@ -257,22 +293,22 @@ static void health_silencers_init(void) {
if (copied == (length* sizeof(char))) {
str[length] = 0x00;
json_parse(str, NULL, health_silencers_json_read_callback);
- info("Parsed health silencers file %s", silencers_filename);
+ netdata_log_info("Parsed health silencers file %s", silencers_filename);
} else {
- error("Cannot read the data from health silencers file %s", silencers_filename);
+ netdata_log_error("Cannot read the data from health silencers file %s", silencers_filename);
}
freez(str);
}
} else {
- error(
- "Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.",
- silencers_filename,
- (int64_t)length,
- HEALTH_SILENCERS_MAX_FILE_LEN);
+ netdata_log_error("Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.",
+ silencers_filename,
+ (int64_t)length,
+ HEALTH_SILENCERS_MAX_FILE_LEN);
}
fclose(fd);
} else {
- info("Cannot open the file %s, so Netdata will work with the default health configuration.",silencers_filename);
+ netdata_log_info("Cannot open the file %s, so Netdata will work with the default health configuration.",
+ silencers_filename);
}
}
@@ -282,10 +318,10 @@ static void health_silencers_init(void) {
* Initialize the health thread.
*/
void health_init(void) {
- debug(D_HEALTH, "Health configuration initializing");
+ netdata_log_debug(D_HEALTH, "Health configuration initializing");
if(!(default_health_enabled = (unsigned int)config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", default_health_enabled))) {
- debug(D_HEALTH, "Health is disabled.");
+ netdata_log_debug(D_HEALTH, "Health is disabled.");
return;
}
@@ -306,7 +342,7 @@ static void health_reload_host(RRDHOST *host) {
if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))
return;
- log_health("[%s]: Reloading health.", rrdhost_hostname(host));
+ netdata_log_health("[%s]: Reloading health.", rrdhost_hostname(host));
char *user_path = health_user_config_dir();
char *stock_path = health_stock_config_dir();
@@ -316,13 +352,13 @@ static void health_reload_host(RRDHOST *host) {
rrdcalctemplate_delete_all(host);
// invalidate all previous entries in the alarm log
- netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_read_lock(&host->health_log.spinlock);
ALARM_ENTRY *t;
for(t = host->health_log.alarms ; t ; t = t->next) {
if(t->new_status != RRDCALC_STATUS_REMOVED)
t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
}
- netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_read_unlock(&host->health_log.spinlock);
// reset all thresholds to all charts
RRDSET *st;
@@ -349,7 +385,7 @@ static void health_reload_host(RRDHOST *host) {
rrdset_foreach_done(st);
#ifdef ENABLE_ACLK
- if (netdata_cloud_setting) {
+ if (netdata_cloud_enabled) {
struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
if (likely(wc)) {
wc->alert_queue_removed = SEND_REMOVED_AFTER_HEALTH_LOOPS;
@@ -397,14 +433,15 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
// do not send notifications for internal statuses
- debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ netdata_log_debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
// do not send notifications for disabled statuses
- debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
- log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ netdata_log_debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ netdata_log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+
// mark it as run, so that we will send the same alarm if it happens again
goto done;
}
@@ -418,11 +455,11 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
if (likely(ret == 1)) {
// we have executed this alarm notification in the past
- if(last_executed_status == ae->new_status) {
+ if(last_executed_status == ae->new_status && !(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) {
// don't send the notification for the same status again
- debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae)
+ netdata_log_debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae)
, rrdcalc_status2string(ae->new_status));
- log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae)
+ netdata_log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae)
, rrdcalc_status2string(ae->new_status));
goto done;
}
@@ -432,7 +469,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// so, don't send CLEAR notifications
if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
- debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
+ netdata_log_debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
, ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
@@ -442,11 +479,11 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// Check if alarm notifications are silenced
if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
- log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ netdata_log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
- log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ netdata_log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health.health_default_exec);
const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health.health_default_recipient);
@@ -533,7 +570,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
ae->old_value,
ae->source?ae_source(ae):"UNKNOWN",
(uint32_t)ae->duration,
- (uint32_t)ae->non_clear_duration,
+ (ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING && ae->new_status >= RRDCALC_STATUS_WARNING) ? (uint32_t)ae->duration : (uint32_t)ae->non_clear_duration,
ae_units(ae),
ae_info(ae),
ae_new_value_string(ae),
@@ -546,20 +583,21 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
buffer_tostring(crit_alarms),
ae->classification?ae_classification(ae):"Unknown",
edit_command,
- host != localhost ? host->machine_guid:"");
+ host->machine_guid,
+ &ae->transition_id);
const char *command_to_run = buffer_tostring(wb);
if (ok) {
ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */
- debug(D_HEALTH, "executing command '%s'", command_to_run);
+ netdata_log_debug(D_HEALTH, "executing command '%s'", command_to_run);
ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
enqueue_alarm_notify_in_progress(ae);
health_alarm_log_save(host, ae);
} else {
- error("Failed to format command arguments");
+ netdata_log_error("Failed to format command arguments");
}
buffer_free(wb);
@@ -578,7 +616,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
return;
spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp);
- debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
+ netdata_log_debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
if(ae->exec_code != 0)
@@ -588,7 +626,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
}
static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
- debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
+ netdata_log_debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
ae->chart?ae_chart_name(ae):"NOCHART", ae_name(ae),
ae->new_value,
rrdcalc_status2string(ae->old_status),
@@ -602,31 +640,29 @@ static inline void health_alarm_log_process(RRDHOST *host) {
uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
time_t now = now_realtime_sec();
- netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_read_lock(&host->health_log.spinlock);
ALARM_ENTRY *ae;
for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
- if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
- if(unlikely(
+ if(unlikely(
!(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
!(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
- )) {
- if(unlikely(ae->unique_id < first_waiting))
- first_waiting = ae->unique_id;
+ )) {
+ if(unlikely(ae->unique_id < first_waiting))
+ first_waiting = ae->unique_id;
- if(likely(now >= ae->delay_up_to_timestamp))
- health_process_notifications(host, ae);
- }
+ if(likely(now >= ae->delay_up_to_timestamp))
+ health_process_notifications(host, ae);
}
}
- netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_read_unlock(&host->health_log.spinlock);
// remember this for the next iteration
host->health_last_processed_id = first_waiting;
//delete those that are updated, no in progress execution, and is not repeating
- netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_write_lock(&host->health_log.spinlock);
ALARM_ENTRY *prev = NULL, *next = NULL;
for(ae = host->health_log.alarms; ae ; ae = next) {
@@ -639,7 +675,7 @@ static inline void health_alarm_log_process(RRDHOST *host) {
||
((ae->new_status == RRDCALC_STATUS_REMOVED) &&
(ae->flags & HEALTH_ENTRY_FLAG_SAVED) &&
- (ae->when + 3600 < now_realtime_sec())))
+ (ae->when + 86400 < now_realtime_sec())))
{
if(host->health_log.alarms == ae) {
@@ -658,12 +694,12 @@ static inline void health_alarm_log_process(RRDHOST *host) {
prev = ae;
}
- netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_write_unlock(&host->health_log.spinlock);
}
static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
if(unlikely(!rc->rrdset)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
+ netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
@@ -674,27 +710,27 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
*next_run = rc->next_update;
}
- debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now));
+ netdata_log_debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now));
return 0;
}
if(unlikely(!rc->update_every)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc));
+ netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc));
+ netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc));
+ netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
+ netdata_log_debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
@@ -703,7 +739,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
time_t last = rrdset_last_entry_s(rc->rrdset);
if(unlikely(now + update_every < first /* || now - update_every > last */)) {
- debug(D_HEALTH
+ netdata_log_debug(D_HEALTH
, "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
, rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) now, (unsigned long) first
, (unsigned long) last);
@@ -714,7 +750,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
time_t needed = now + rc->before + rc->after;
if(needed + update_every < first || needed - update_every > last) {
- debug(D_HEALTH
+ netdata_log_debug(D_HEALTH
, "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
, rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) needed, (unsigned long) first
, (unsigned long) last);
@@ -747,10 +783,10 @@ static void health_main_cleanup(void *ptr) {
struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
- info("cleaning up...");
+ netdata_log_info("cleaning up...");
static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
- log_health("Health thread ended.");
+ netdata_log_health("Health thread ended.");
}
static void initialize_health(RRDHOST *host)
@@ -762,7 +798,7 @@ static void initialize_health(RRDHOST *host)
rrdhost_flag_set(host, RRDHOST_FLAG_INITIALIZED_HEALTH);
- log_health("[%s]: Initializing health.", rrdhost_hostname(host));
+ netdata_log_health("[%s]: Initializing health.", rrdhost_hostname(host));
host->health.health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
host->health.health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
@@ -775,16 +811,32 @@ static void initialize_health(RRDHOST *host)
long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max);
if(n < 10) {
- error("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max);
+ netdata_log_health("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max);
config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max);
}
else
host->health_log.max = (unsigned int)n;
+ uint32_t m = config_get_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_DEFAULT_HISTORY);
+ if (m < HEALTH_LOG_MINIMUM_HISTORY) {
+ netdata_log_health("Host '%s': health configuration has invalid health log history %u. Using minimum %d", rrdhost_hostname(host), m, HEALTH_LOG_MINIMUM_HISTORY);
+ config_set_number(CONFIG_SECTION_HEALTH, "health log history", HEALTH_LOG_MINIMUM_HISTORY);
+ m = HEALTH_LOG_MINIMUM_HISTORY;
+ }
+
+ //default health log history is 5 days and not less than a day
+ if (host->health_log.health_log_history) {
+ if (host->health_log.health_log_history < HEALTH_LOG_MINIMUM_HISTORY)
+ host->health_log.health_log_history = HEALTH_LOG_MINIMUM_HISTORY;
+ } else
+ host->health_log.health_log_history = m;
+
+ netdata_log_health("[%s]: Health log history is set to %u seconds (%u days)", rrdhost_hostname(host), host->health_log.health_log_history, host->health_log.health_log_history / 86400);
+
conf_enabled_alarms = simple_pattern_create(config_get(CONFIG_SECTION_HEALTH, "enabled alarms", "*"), NULL,
SIMPLE_PATTERN_EXACT, true);
- netdata_rwlock_init(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_init(&host->health_log.spinlock);
char filename[FILENAME_MAX + 1];
@@ -794,7 +846,6 @@ static void initialize_health(RRDHOST *host)
// TODO: This needs to go to the metadata thread
// Health should wait before accessing the table (needs to be created by the metadata thread)
- sql_create_health_log_table(host);
sql_health_alarm_log_load(host);
// ------------------------------------------------------------------------
@@ -821,20 +872,20 @@ static void health_sleep(time_t next_run, unsigned int loop __maybe_unused) {
time_t now = now_realtime_sec();
if(now < next_run) {
worker_is_idle();
- debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
+ netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
while (now < next_run && service_running(SERVICE_HEALTH)) {
sleep_usec(USEC_PER_SEC);
now = now_realtime_sec();
}
}
else {
- debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
+ netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
}
}
static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *silencers) {
SILENCER *s;
- debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s",
+ netdata_log_debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s",
rrdcalc_name(rc), (rc->rrdset)?rrdset_context(rc->rrdset):"", rrdcalc_chart_name(rc), host, (rc->rrdset)?rrdset_family(rc->rrdset):"");
for (s = silencers->silencers; s!=NULL; s=s->next){
@@ -845,11 +896,11 @@ static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *sil
(!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches_string(s->charts_pattern, rc->chart))) &&
(!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches_string(s->families_pattern, rc->rrdset->family)))
) {
- debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families);
+ netdata_log_debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families);
if (unlikely(silencers->stype == STYPE_NONE)) {
- debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc));
+ netdata_log_debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc));
} else {
- debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
+ netdata_log_debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
, (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
, rrdcalc_name(rc)
, (rc->rrdset)?rrdset_context(rc->rrdset):""
@@ -888,7 +939,7 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
}
if (rrdcalc_flags_old != rc->run_flags) {
- info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
+ netdata_log_info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
rrdhost_hostname(host),
rrdcalc_name(rc),
(rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false",
@@ -905,7 +956,7 @@ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
static void sql_health_postpone_queue_removed(RRDHOST *host __maybe_unused) {
#ifdef ENABLE_ACLK
- if (netdata_cloud_setting) {
+ if (netdata_cloud_enabled) {
struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
if (unlikely(!wc)) {
return;
@@ -1001,7 +1052,7 @@ void *health_main(void *ptr) {
while(service_running(SERVICE_HEALTH)) {
loop++;
- debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
+ netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
time_t now = now_realtime_sec();
int runnable = 0, apply_hibernation_delay = 0;
@@ -1012,7 +1063,7 @@ void *health_main(void *ptr) {
if (unlikely(check_if_resumed_from_suspension())) {
apply_hibernation_delay = 1;
- log_health(
+ netdata_log_health(
"Postponing alarm checks for %"PRId64" seconds, "
"because it seems that the system was just resumed from suspension.",
(int64_t)hibernation_delay);
@@ -1021,7 +1072,7 @@ void *health_main(void *ptr) {
if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
static int logged=0;
if (!logged) {
- log_health("Skipping health checks, because all alarms are disabled via a %s command.",
+ netdata_log_health("Skipping health checks, because all alarms are disabled via a %s command.",
HEALTH_CMDAPI_CMD_DISABLEALL);
logged = 1;
}
@@ -1044,7 +1095,7 @@ void *health_main(void *ptr) {
rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
if (unlikely(apply_hibernation_delay)) {
- log_health(
+ netdata_log_health(
"[%s]: Postponing health checks for %"PRId64" seconds.",
rrdhost_hostname(host),
(int64_t)hibernation_delay);
@@ -1057,20 +1108,20 @@ void *health_main(void *ptr) {
continue;
}
- log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host));
+ netdata_log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host));
host->health.health_delay_up_to = 0;
}
// wait until cleanup of obsolete charts on children is complete
if (host != localhost) {
if (unlikely(host->trigger_chart_obsoletion_check == 1)) {
- log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host));
+ netdata_log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host));
continue;
}
}
if (!health_running_logged) {
- log_health("[%s]: Health is running.", rrdhost_hostname(host));
+ netdata_log_health("[%s]: Health is running.", rrdhost_hostname(host));
health_running_logged = true;
}
@@ -1127,11 +1178,13 @@ void *health_main(void *ptr) {
rc->old_status = rc->status;
rc->status = RRDCALC_STATUS_REMOVED;
rc->last_status_change = now;
+ rc->last_status_change_value = rc->value;
rc->last_updated = now;
rc->value = NAN;
+ rc->ae = ae;
#ifdef ENABLE_ACLK
- if (netdata_cloud_setting)
+ if (netdata_cloud_enabled)
sql_queue_alarm_to_aclk(host, ae, 1);
#endif
}
@@ -1170,7 +1223,7 @@ void *health_main(void *ptr) {
rc->value = NAN;
rc->run_flags |= RRDCALC_FLAG_DB_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
+ netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), ret
);
} else
@@ -1181,14 +1234,14 @@ void *health_main(void *ptr) {
rc->value = NAN;
rc->run_flags |= RRDCALC_FLAG_DB_NAN;
- debug(D_HEALTH,
+ netdata_log_debug(D_HEALTH,
"Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc)
);
} else
rc->run_flags &= ~RRDCALC_FLAG_DB_NAN;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT,
+ netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT,
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->value
);
}
@@ -1204,14 +1257,14 @@ void *health_main(void *ptr) {
rc->value = NAN;
rc->run_flags |= RRDCALC_FLAG_CALC_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
+ netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
);
} else {
rc->run_flags &= ~RRDCALC_FLAG_CALC_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
+ netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
NETDATA_DOUBLE_FORMAT
": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
rc->calculation->parsed_as, rc->calculation->result,
@@ -1248,14 +1301,14 @@ void *health_main(void *ptr) {
// calculation failed
rc->run_flags |= RRDCALC_FLAG_WARN_ERROR;
- debug(D_HEALTH,
+ netdata_log_debug(D_HEALTH,
"Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
buffer_tostring(rc->warning->error_msg)
);
} else {
rc->run_flags &= ~RRDCALC_FLAG_WARN_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
+ netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
NETDATA_DOUBLE_FORMAT
": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
rrdcalc_name(rc), rc->warning->result, buffer_tostring(rc->warning->error_msg), rrdcalc_source(rc)
@@ -1274,14 +1327,14 @@ void *health_main(void *ptr) {
// calculation failed
rc->run_flags |= RRDCALC_FLAG_CRIT_ERROR;
- debug(D_HEALTH,
+ netdata_log_debug(D_HEALTH,
"Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
buffer_tostring(rc->critical->error_msg)
);
} else {
rc->run_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
+ netdata_log_debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
NETDATA_DOUBLE_FORMAT
": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
rrdcalc_name(rc), rc->critical->result, buffer_tostring(rc->critical->error_msg),
@@ -1297,36 +1350,37 @@ void *health_main(void *ptr) {
RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED;
switch (warning_status) {
- case RRDCALC_STATUS_CLEAR:
- status = RRDCALC_STATUS_CLEAR;
- break;
+ case RRDCALC_STATUS_CLEAR:
+ status = RRDCALC_STATUS_CLEAR;
+ break;
- case RRDCALC_STATUS_RAISED:
- status = RRDCALC_STATUS_WARNING;
- break;
+ case RRDCALC_STATUS_RAISED:
+ status = RRDCALC_STATUS_WARNING;
+ break;
- default:
- break;
+ default:
+ break;
}
switch (critical_status) {
- case RRDCALC_STATUS_CLEAR:
- if (status == RRDCALC_STATUS_UNDEFINED)
- status = RRDCALC_STATUS_CLEAR;
- break;
+ case RRDCALC_STATUS_CLEAR:
+ if (status == RRDCALC_STATUS_UNDEFINED)
+ status = RRDCALC_STATUS_CLEAR;
+ break;
- case RRDCALC_STATUS_RAISED:
- status = RRDCALC_STATUS_CRITICAL;
- break;
+ case RRDCALC_STATUS_RAISED:
+ status = RRDCALC_STATUS_CRITICAL;
+ break;
- default:
- break;
+ default:
+ break;
}
// --------------------------------------------------------
// check if the new status and the old differ
if (status != rc->status) {
+
worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
int delay = 0;
@@ -1392,11 +1446,19 @@ void *health_main(void *ptr) {
health_alarm_log_add_entry(host, ae);
- log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status));
+ netdata_log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status));
+ rc->last_status_change_value = rc->value;
rc->last_status_change = now;
rc->old_status = rc->status;
rc->status = status;
+ rc->ae = ae;
+
+ if(unlikely(rrdcalc_isrepeating(rc))) {
+ rc->last_repeat = now;
+ if (rc->status == RRDCALC_STATUS_CLEAR)
+ rc->run_flags |= RRDCALC_FLAG_RUN_ONCE;
+ }
}
rc->last_updated = now;
@@ -1437,7 +1499,6 @@ void *health_main(void *ptr) {
worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
rc->last_repeat = now;
if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
-
ALARM_ENTRY *ae = health_create_alarm_entry(
host,
rc->id,
@@ -1475,7 +1536,7 @@ void *health_main(void *ptr) {
}
rc->run_flags |= RRDCALC_FLAG_RUN_ONCE;
health_process_notifications(host, ae);
- debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
+ netdata_log_debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
health_alarm_wait_for_execution(ae);
health_alarm_log_free_one_nochecks_nounlink(ae);
}
@@ -1503,7 +1564,7 @@ void *health_main(void *ptr) {
break;
}
#ifdef ENABLE_ACLK
- if (netdata_cloud_setting) {
+ if (netdata_cloud_enabled) {
struct aclk_sync_host_config *wc = (struct aclk_sync_host_config *)host->aclk_sync_host_config;
if (unlikely(!wc)) {
continue;
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
index 65f1a69ab..7a0afcd18 100644
--- a/health/health.d/apcupsd.conf
+++ b/health/health.d/apcupsd.conf
@@ -11,7 +11,6 @@ component: UPS
units: %
every: 1m
warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 10m multiplier 1.5 max 1h
info: average UPS load over the last 10 minutes
to: sitemgr
@@ -29,7 +28,7 @@ component: UPS
units: %
every: 60s
warn: $this < 100
- crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ crit: $this < 40
delay: down 10m multiplier 1.5 max 1h
info: average UPS charge over the last minute
to: sitemgr
@@ -43,7 +42,6 @@ component: UPS device
every: 10s
units: seconds ago
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: sitemgr
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index 49cb5ad0f..3f92e80df 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -22,8 +22,7 @@ component: Disk
calc: $dirty + $metadata + $undefined
units: %
every: 1m
- warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
- crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
+ warn: $this > 75
delay: up 1m down 1h multiplier 1.5 max 2h
info: percentage of cache space used for dirty data and metadata \
(this usually means your SSD cache is too small)
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
index 13ac8c182..4ee8bc0bd 100644
--- a/health/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
@@ -8,8 +8,7 @@ component: Beanstalk
calc: $buried
units: jobs
every: 10s
- warn: $this > 0
- crit: $this > 10
+ warn: $this > 3
delay: up 0 down 5m multiplier 1.2 max 1h
info: number of buried jobs across all tubes. \
You need to manually kick them so they can be processed. \
diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf
index 7c09225ff..b3e75a239 100644
--- a/health/health.d/bind_rndc.conf
+++ b/health/health.d/bind_rndc.conf
@@ -7,6 +7,5 @@ component: BIND
every: 60
calc: $stats_size
warn: $this > 512
- crit: $this > 1024
info: BIND statistics-file size
to: sysadmin
diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf
index 6f37787d7..b7dcbe316 100644
--- a/health/health.d/boinc.conf
+++ b/health/health.d/boinc.conf
@@ -12,7 +12,6 @@ component: BOINC
units: tasks
every: 1m
warn: $this > 0
- crit: $this > 1
delay: up 1m down 5m multiplier 1.5 max 1h
info: average number of compute errors over the last 10 minutes
to: sysadmin
@@ -29,7 +28,6 @@ component: BOINC
units: tasks
every: 1m
warn: $this > 0
- crit: $this > 1
delay: up 1m down 5m multiplier 1.5 max 1h
info: average number of failed uploads over the last 10 minutes
to: sysadmin
@@ -46,7 +44,6 @@ component: BOINC
units: tasks
every: 1m
warn: $this < 1
- crit: $this < 0.1
delay: up 5m down 10m multiplier 1.5 max 1h
info: average number of total tasks over the last 10 minutes
to: sysadmin
@@ -64,7 +61,6 @@ component: BOINC
units: tasks
every: 1m
warn: $this < 1
- crit: $this < 0.1
delay: up 5m down 10m multiplier 1.5 max 1h
info: average number of active tasks over the last 10 minutes
to: sysadmin
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index 08260ff6d..f625e5455 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -64,7 +64,6 @@ component: Network
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(200):(5000))
- crit: $this > (($status == $CRITICAL)?(5000):(6000))
options: no-clear-notification
info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
compared to the rate over the last minute
@@ -83,7 +82,6 @@ component: CPU
units: %
every: 1m
warn: $this > (($status >= $WARNING) ? (75) : (85))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
info: average cgroup CPU utilization over the last 10 minutes
to: sysadmin
@@ -134,7 +132,6 @@ component: Network
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(200):(5000))
- crit: $this > (($status == $CRITICAL)?(5000):(6000))
options: no-clear-notification
info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
compared to the rate over the last minute
diff --git a/health/health.d/consul.conf b/health/health.d/consul.conf
index dff6d2df3..7edca6563 100644
--- a/health/health.d/consul.conf
+++ b/health/health.d/consul.conf
@@ -10,7 +10,7 @@ component: Consul
units: seconds
warn: $this < 14*24*60*60
crit: $this < 7*24*60*60
- info: Consul Enterprise licence expiration time on node ${label:node_name} datacenter ${label:datacenter}
+ info: Consul Enterprise license expiration time on node ${label:node_name} datacenter ${label:datacenter}
to: sysadmin
template: consul_autopilot_health_status
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
index ad6952825..907d6ff8a 100644
--- a/health/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
@@ -28,7 +28,6 @@ component: CPU
units: %
every: 1m
warn: $this > (($status >= $WARNING) ? (20) : (40))
- crit: $this > (($status == $CRITICAL) ? (40) : (50))
delay: down 15m multiplier 1.5 max 1h
info: average CPU iowait time over the last 10 minutes
to: sysadmin
@@ -44,7 +43,6 @@ component: CPU
units: %
every: 5m
warn: $this > (($status >= $WARNING) ? (5) : (10))
- crit: $this > (($status == $CRITICAL) ? (20) : (30))
delay: down 1h multiplier 1.5 max 2h
info: average CPU steal time over the last 20 minutes
to: sysadmin
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index 010b94599..81d37df64 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -9,7 +9,6 @@ component: Dnsmasq
units: %
calc: $used
warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
- crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
delay: down 5m
info: DHCP range utilization
to: sysadmin
diff --git a/health/health.d/docker.conf b/health/health.d/docker.conf
index f17028472..01919dc0d 100644
--- a/health/health.d/docker.conf
+++ b/health/health.d/docker.conf
@@ -6,6 +6,6 @@ component: Docker
units: status
every: 10s
lookup: average -10s of unhealthy
- crit: $this > 0
+ warn: $this > 0
info: ${label:container_name} docker container health status is unhealthy
to: sysadmin
diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf
index 47f8e1eb9..29f1e9b27 100644
--- a/health/health.d/elasticsearch.conf
+++ b/health/health.d/elasticsearch.conf
@@ -10,7 +10,7 @@ component: Elasticsearch
lookup: average -5s unaligned of *ed
every: 10s
units: status
- warn: $this == 1
+ crit: $this == 1
delay: down 5m multiplier 1.5 max 1h
info: cluster health status is red.
to: sysadmin
diff --git a/health/health.d/file_descriptors.conf b/health/health.d/file_descriptors.conf
new file mode 100644
index 000000000..d136ea517
--- /dev/null
+++ b/health/health.d/file_descriptors.conf
@@ -0,0 +1,31 @@
+ # you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: system_file_descriptors_utilization
+ on: system.file_nr_utilization
+ class: Utilization
+ type: System
+ component: Processes
+ hosts: *
+ lookup: max -1m unaligned
+ units: %
+ every: 1m
+ crit: $this > 90
+ delay: down 15m multiplier 1.5 max 1h
+ info: system-wide utilization of open files
+ to: sysadmin
+
+ template: apps_group_file_descriptors_utilization
+ on: apps.fd_limit
+ class: Utilization
+ type: System
+component: Process
+ os: linux
+ module: !* *
+ hosts: *
+ lookup: max -1m unaligned foreach *
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (85) : (90))
+ delay: down 15m multiplier 1.5 max 1h
+ info: maximum utilization of open files among all application group PIDs
+ to: sysadmin
diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf
index 14010d445..580d114f8 100644
--- a/health/health.d/gearman.conf
+++ b/health/health.d/gearman.conf
@@ -8,7 +8,6 @@ component: Gearman
units: workers
every: 10s
warn: $this > 30000
- crit: $this > 100000
delay: down 5m multiplier 1.5 max 1h
info: average number of queued jobs over the last 10 minutes
to: sysadmin
diff --git a/health/health.d/geth.conf b/health/health.d/geth.conf
index dd1eb4701..361b6b41f 100644
--- a/health/health.d/geth.conf
+++ b/health/health.d/geth.conf
@@ -8,5 +8,4 @@ component: geth
calc: $chain_head_block - $chain_head_header
units: blocks
warn: $this != 0
- crit: $this > 5
delay: down 1m multiplier 1.5 max 1h
diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf
index 2786cbd62..47ac4453c 100644
--- a/health/health.d/ioping.conf
+++ b/health/health.d/ioping.conf
@@ -6,10 +6,8 @@ component: Disk
lookup: average -10s unaligned of latency
units: microseconds
every: 10s
- green: 5000
- red: 10000
+ green: 10000
warn: $this > $green
- crit: $this > $red
delay: down 30m multiplier 1.5 max 2h
info: average I/O latency over the last 10 seconds
to: sysadmin
diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf
index c178a410a..3d1b46c02 100644
--- a/health/health.d/ipc.conf
+++ b/health/health.d/ipc.conf
@@ -12,7 +12,6 @@ component: IPC
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (70) : (90))
delay: down 5m multiplier 1.5 max 1h
info: IPC semaphore utilization
to: sysadmin
@@ -28,7 +27,6 @@ component: IPC
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (70) : (90))
delay: down 5m multiplier 1.5 max 1h
info: IPC semaphore arrays utilization
to: sysadmin
diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf
index feadba1b7..4d6478cca 100644
--- a/health/health.d/ipmi.conf
+++ b/health/health.d/ipmi.conf
@@ -1,15 +1,15 @@
- alarm: ipmi_sensors_states
- on: ipmi.sensors_states
+ template: ipmi_sensor_state
+ on: ipmi.sensor_state
class: Errors
type: System
component: IPMI
calc: $warning + $critical
- units: sensors
+ units: state
every: 10s
- warn: $this > 0
+ warn: $warning > 0
crit: $critical > 0
delay: up 5m down 15m multiplier 1.5 max 1h
- info: number of IPMI sensors in non-nominal state
+ info: IPMI sensor ${label:sensor} (${label:component}) state
to: sysadmin
alarm: ipmi_events
diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf
index c0bc6de8a..4562122ca 100644
--- a/health/health.d/linux_power_supply.conf
+++ b/health/health.d/linux_power_supply.conf
@@ -9,7 +9,6 @@ component: Battery
units: %
every: 10s
warn: $this < 10
- crit: $this < 5
delay: up 30s down 5m multiplier 1.2 max 1h
info: percentage of remaining power supply capacity
to: sysadmin
diff --git a/health/health.d/nut.conf b/health/health.d/nut.conf
index 6231dd97b..67843205c 100644
--- a/health/health.d/nut.conf
+++ b/health/health.d/nut.conf
@@ -26,8 +26,8 @@ component: UPS
lookup: average -60s unaligned of battery_charge
units: %
every: 60s
- warn: $this < 100
- crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ warn: $this < 75
+ crit: $this < 40
delay: down 10m multiplier 1.5 max 1h
info: average UPS charge over the last minute
to: sitemgr
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index ee6c57cc5..045930ae5 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -10,8 +10,7 @@ component: Pi-hole
every: 10s
units: seconds
calc: $ago
- warn: $this > 60 * 60 * 24 * 8
- crit: $this > 60 * 60 * 24 * 8 * 2
+ warn: $this > 60 * 60 * 24 * 30
info: gravity.list (blocklist) file last update time
to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index ab382c43b..34e5431a8 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -28,7 +28,6 @@ component: Memory
units: %
every: 10s
warn: $this < (($status >= $WARNING) ? (15) : (10))
- crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
delay: down 15m multiplier 1.5 max 1h
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
to: sysadmin
@@ -74,7 +73,6 @@ component: Memory
units: %
every: 10s
warn: $this < (($status >= $WARNING) ? (15) : (10))
- crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
delay: down 15m multiplier 1.5 max 1h
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
to: sysadmin
diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf
index ab110bf07..27a857fcd 100644
--- a/health/health.d/scaleio.conf
+++ b/health/health.d/scaleio.conf
@@ -9,8 +9,8 @@ component: ScaleIO
calc: $used
units: %
every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ warn: $this > (($status >= $WARNING) ? (80) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (90))
delay: down 15m multiplier 1.5 max 1h
info: storage pool capacity utilization
to: sysadmin
diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf
index a9cc7ceef..bff34cd39 100644
--- a/health/health.d/vcsa.conf
+++ b/health/health.d/vcsa.conf
@@ -133,8 +133,7 @@ component: VMware vCenter
lookup: max -10s unaligned of software_packages
units: status
every: 10s
- warn: $this == 4
- crit: $this == 3
+ warn: ($this == 3) || ($this == 4)
delay: down 1m multiplier 1.5 max 1h
info: software updates availability status \
(-1: unknown, 0: green, 2: orange, 3: red, 4: grey)
diff --git a/health/health.d/windows.conf b/health/health.d/windows.conf
index d4bc7639c..28a886386 100644
--- a/health/health.d/windows.conf
+++ b/health/health.d/windows.conf
@@ -6,7 +6,7 @@
class: Utilization
type: Windows
component: CPU
- os: linux
+ os: *
hosts: *
lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt
units: %
@@ -25,7 +25,7 @@ component: CPU
class: Utilization
type: Windows
component: Memory
- os: linux
+ os: *
hosts: *
calc: ($used) * 100 / ($used + $available)
units: %
@@ -36,31 +36,15 @@ component: Memory
info: memory utilization
to: sysadmin
- template: windows_swap_in_use
- on: windows.memory_swap_utilization
- class: Utilization
- type: Windows
-component: Memory
- os: linux
- hosts: *
- calc: ($used) * 100 / ($used + $available)
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: down 15m multiplier 1.5 max 1h
- info: swap memory utilization
- to: sysadmin
-
## Network
template: windows_inbound_packets_discarded
- on: windows.net_discarded
+ on: windows.net_nic_discarded
class: Errors
type: Windows
component: Network
- os: linux
+ os: *
hosts: *
lookup: sum -10m unaligned absolute match-names of inbound
units: packets
@@ -71,11 +55,11 @@ component: Network
to: sysadmin
template: windows_outbound_packets_discarded
- on: windows.net_discarded
+ on: windows.net_nic_discarded
class: Errors
type: Windows
component: Network
- os: linux
+ os: *
hosts: *
lookup: sum -10m unaligned absolute match-names of outbound
units: packets
@@ -86,11 +70,11 @@ component: Network
to: sysadmin
template: windows_inbound_packets_errors
- on: windows.net_errors
+ on: windows.net_nic_errors
class: Errors
type: Windows
component: Network
- os: linux
+ os: *
hosts: *
lookup: sum -10m unaligned absolute match-names of inbound
units: packets
@@ -101,11 +85,11 @@ component: Network
to: sysadmin
template: windows_outbound_packets_errors
- on: windows.net_errors
+ on: windows.net_nic_errors
class: Errors
type: Windows
component: Network
- os: linux
+ os: *
hosts: *
lookup: sum -10m unaligned absolute match-names of outbound
units: packets
@@ -119,11 +103,11 @@ component: Network
## Disk
template: windows_disk_in_use
- on: windows.logical_disk_utilization
+ on: windows.logical_disk_space_usage
class: Utilization
type: Windows
component: Disk
- os: linux
+ os: *
hosts: *
calc: ($used) * 100 / ($used + $free)
units: %
diff --git a/health/health.h b/health/health.h
index c36aabac7..543bc56a1 100644
--- a/health/health.h
+++ b/health/health.h
@@ -7,18 +7,21 @@
extern unsigned int default_health_enabled;
-#define HEALTH_ENTRY_FLAG_PROCESSED 0x00000001
-#define HEALTH_ENTRY_FLAG_UPDATED 0x00000002
-#define HEALTH_ENTRY_FLAG_EXEC_RUN 0x00000004
-#define HEALTH_ENTRY_FLAG_EXEC_FAILED 0x00000008
-#define HEALTH_ENTRY_FLAG_SILENCED 0x00000010
-#define HEALTH_ENTRY_RUN_ONCE 0x00000020
-#define HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS 0x00000040
-#define HEALTH_ENTRY_FLAG_IS_REPEATING 0x00000080
-
-#define HEALTH_ENTRY_FLAG_SAVED 0x10000000
-#define HEALTH_ENTRY_FLAG_ACLK_QUEUED 0x20000000
-#define HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION 0x80000000
+typedef enum __attribute__((packed)) { // single-bit flags; this is the type of the `flags` argument of health_create_alarm_entry()
+ HEALTH_ENTRY_FLAG_PROCESSED = 0x00000001, // notifications engine has processed this
+ HEALTH_ENTRY_FLAG_UPDATED = 0x00000002, // there is a more recent update about this transition
+ HEALTH_ENTRY_FLAG_EXEC_RUN = 0x00000004, // notification script has been run (this is the intent, not the result)
+ HEALTH_ENTRY_FLAG_EXEC_FAILED = 0x00000008, // notification script couldn't be run
+ HEALTH_ENTRY_FLAG_SILENCED = 0x00000010, // presumably: the notification was suppressed by a silencer - TODO confirm
+ HEALTH_ENTRY_RUN_ONCE = 0x00000020, // NOTE(review): the only member without the _FLAG_ infix; meaning not visible here - TODO confirm
+ HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS = 0x00000040, // presumably: the notification script is still running - TODO confirm
+ HEALTH_ENTRY_FLAG_IS_REPEATING = 0x00000080, // presumably: the entry belongs to a repeating alarm - TODO confirm
+ HEALTH_ENTRY_FLAG_SAVED = 0x10000000, // Saved to SQL
+ HEALTH_ENTRY_FLAG_ACLK_QUEUED = 0x20000000, // Sent to Netdata Cloud
+ HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION = 0x80000000, // bit 31: does not fit in a signed 32-bit int, so the enum needs an unsigned 32-bit representation
+} HEALTH_ENTRY_FLAGS;
+
+void health_entry_flags_to_json_array(BUFFER *wb, const char *key, HEALTH_ENTRY_FLAGS flags);
#ifndef HEALTH_LISTEN_PORT
#define HEALTH_LISTEN_PORT 19998
@@ -28,6 +31,14 @@ extern unsigned int default_health_enabled;
#define HEALTH_LISTEN_BACKLOG 4096
#endif
+#ifndef HEALTH_LOG_DEFAULT_HISTORY
+#define HEALTH_LOG_DEFAULT_HISTORY 432000 // used only when not already defined; 432000 = 5 * 86400 (presumably seconds, i.e. 5 days - TODO confirm unit)
+#endif
+
+#ifndef HEALTH_LOG_MINIMUM_HISTORY
+#define HEALTH_LOG_MINIMUM_HISTORY 86400 // used only when not already defined; presumably seconds (1 day) - TODO confirm unit
+#endif
+
#define HEALTH_SILENCERS_MAX_FILE_LEN 10000
extern char *silencers_filename;
@@ -40,6 +51,7 @@ void health_reload(void);
void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status);
void health_alarms2json(RRDHOST *host, BUFFER *wb, int all);
+void health_alert2json_conf(RRDHOST *host, BUFFER *wb, CONTEXTS_V2_OPTIONS all);
void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all);
void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf);
@@ -73,7 +85,7 @@ ALARM_ENTRY* health_create_alarm_entry(
STRING *units,
STRING *info,
int delay,
- uint32_t flags);
+ HEALTH_ENTRY_FLAGS flags);
void health_alarm_log_add_entry(RRDHOST *host, ALARM_ENTRY *ae);
diff --git a/health/health_config.c b/health/health_config.c
index a11fd51cd..4e93235e2 100644
--- a/health/health_config.c
+++ b/health/health_config.c
@@ -61,36 +61,36 @@ static inline int health_parse_delay(
if(!strcasecmp(key, "up")) {
if (!config_parse_duration(value, delay_up_duration)) {
- error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
- line, filename, value, key);
+ netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
+ line, filename, value, key);
}
else given_up = 1;
}
else if(!strcasecmp(key, "down")) {
if (!config_parse_duration(value, delay_down_duration)) {
- error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
- line, filename, value, key);
+ netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
+ line, filename, value, key);
}
else given_down = 1;
}
else if(!strcasecmp(key, "multiplier")) {
*delay_multiplier = strtof(value, NULL);
if(isnan(*delay_multiplier) || isinf(*delay_multiplier) || islessequal(*delay_multiplier, 0)) {
- error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
- line, filename, value, key);
+ netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
+ line, filename, value, key);
}
else given_multiplier = 1;
}
else if(!strcasecmp(key, "max")) {
if (!config_parse_duration(value, delay_max_duration)) {
- error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
- line, filename, value, key);
+ netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
+ line, filename, value, key);
}
else given_max = 1;
}
else {
- error("Health configuration at line %zu of file '%s': unknown keyword '%s'",
- line, filename, key);
+ netdata_log_error("Health configuration at line %zu of file '%s': unknown keyword '%s'",
+ line, filename, key);
}
}
@@ -136,7 +136,7 @@ static inline uint32_t health_parse_options(const char *s) {
if(!strcasecmp(buf, "no-clear-notification") || !strcasecmp(buf, "no-clear"))
options |= RRDCALC_OPTION_NO_CLEAR_NOTIFICATION;
else
- error("Ignoring unknown alarm option '%s'", buf);
+ netdata_log_error("Ignoring unknown alarm option '%s'", buf);
}
}
@@ -171,14 +171,14 @@ static inline int health_parse_repeat(
}
if(!strcasecmp(key, "warning")) {
if (!config_parse_duration(value, (int*)warn_repeat_every)) {
- error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
- line, file, value, key);
+ netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
+ line, file, value, key);
}
}
else if(!strcasecmp(key, "critical")) {
if (!config_parse_duration(value, (int*)crit_repeat_every)) {
- error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
- line, file, value, key);
+ netdata_log_error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
+ line, file, value, key);
}
}
}
@@ -308,7 +308,7 @@ static inline int health_parse_db_lookup(
RRDR_TIME_GROUPING *group_method, int *after, int *before, int *every,
RRDCALC_OPTIONS *options, STRING **dimensions, STRING **foreachdim
) {
- debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s: %s", line, filename, string);
+ netdata_log_debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s: %s", line, filename, string);
if(*dimensions) string_freez(*dimensions);
if(*foreachdim) string_freez(*foreachdim);
@@ -326,14 +326,14 @@ static inline int health_parse_db_lookup(
while(*s && !isspace(*s)) s++;
while(*s && isspace(*s)) *s++ = '\0';
if(!*s) {
- error("Health configuration invalid chart calculation at line %zu of file '%s': expected group method followed by the 'after' time, but got '%s'",
- line, filename, key);
+ netdata_log_error("Health configuration invalid chart calculation at line %zu of file '%s': expected group method followed by the 'after' time, but got '%s'",
+ line, filename, key);
return 0;
}
if((*group_method = time_grouping_parse(key, RRDR_GROUPING_UNDEFINED)) == RRDR_GROUPING_UNDEFINED) {
- error("Health configuration at line %zu of file '%s': invalid group method '%s'",
- line, filename, key);
+ netdata_log_error("Health configuration at line %zu of file '%s': invalid group method '%s'",
+ line, filename, key);
return 0;
}
@@ -343,8 +343,8 @@ static inline int health_parse_db_lookup(
while(*s && isspace(*s)) *s++ = '\0';
if(!config_parse_duration(key, after)) {
- error("Health configuration at line %zu of file '%s': invalid duration '%s' after group method",
- line, filename, key);
+ netdata_log_error("Health configuration at line %zu of file '%s': invalid duration '%s' after group method",
+ line, filename, key);
return 0;
}
@@ -364,8 +364,8 @@ static inline int health_parse_db_lookup(
while(*s && isspace(*s)) *s++ = '\0';
if (!config_parse_duration(value, before)) {
- error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword",
- line, filename, value, key);
+ netdata_log_error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword",
+ line, filename, value, key);
}
}
else if(!strcasecmp(key, HEALTH_EVERY_KEY)) {
@@ -374,8 +374,8 @@ static inline int health_parse_db_lookup(
while(*s && isspace(*s)) *s++ = '\0';
if (!config_parse_duration(value, every)) {
- error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword",
- line, filename, value, key);
+ netdata_log_error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword",
+ line, filename, value, key);
}
}
else if(!strcasecmp(key, "absolute") || !strcasecmp(key, "abs") || !strcasecmp(key, "absolute_sum")) {
@@ -422,8 +422,8 @@ static inline int health_parse_db_lookup(
break;
}
else {
- error("Health configuration at line %zu of file '%s': unknown keyword '%s'",
- line, filename, key);
+ netdata_log_error("Health configuration at line %zu of file '%s': unknown keyword '%s'",
+ line, filename, key);
}
}
@@ -499,6 +499,7 @@ static inline void alert_config_free(struct alert_config *cfg)
string_freez(cfg->p_db_lookup_dimensions);
string_freez(cfg->p_db_lookup_method);
string_freez(cfg->chart_labels);
+ string_freez(cfg->source);
freez(cfg);
}
@@ -506,7 +507,7 @@ int sql_store_hashes = 1;
static int health_readfile(const char *filename, void *data) {
RRDHOST *host = (RRDHOST *)data;
- debug(D_HEALTH, "Health configuration reading file '%s'", filename);
+ netdata_log_debug(D_HEALTH, "Health configuration reading file '%s'", filename);
static uint32_t
hash_alarm = 0,
@@ -573,7 +574,7 @@ static int health_readfile(const char *filename, void *data) {
FILE *fp = fopen(filename, "r");
if(!fp) {
- error("Health configuration cannot read file '%s'.", filename);
+ netdata_log_error("Health configuration cannot read file '%s'.", filename);
return 0;
}
@@ -597,7 +598,8 @@ static int health_readfile(const char *filename, void *data) {
if(append < HEALTH_CONF_MAX_LINE)
continue;
else {
- error("Health configuration has too long multi-line at line %zu of file '%s'.", line, filename);
+ netdata_log_error("Health configuration has too long multi-line at line %zu of file '%s'.",
+ line, filename);
}
}
append = 0;
@@ -605,7 +607,8 @@ static int health_readfile(const char *filename, void *data) {
char *key = s;
while(*s && *s != ':') s++;
if(!*s) {
- error("Health configuration has invalid line %zu of file '%s'. It does not contain a ':'. Ignoring it.", line, filename);
+ netdata_log_error("Health configuration has invalid line %zu of file '%s'. It does not contain a ':'. Ignoring it.",
+ line, filename);
continue;
}
*s = '\0';
@@ -616,12 +619,14 @@ static int health_readfile(const char *filename, void *data) {
value = trim_all(value);
if(!key) {
- error("Health configuration has invalid line %zu of file '%s'. Keyword is empty. Ignoring it.", line, filename);
+ netdata_log_error("Health configuration has invalid line %zu of file '%s'. Keyword is empty. Ignoring it.",
+ line, filename);
continue;
}
if(!value) {
- error("Health configuration has invalid line %zu of file '%s'. value is empty. Ignoring it.", line, filename);
+ netdata_log_error("Health configuration has invalid line %zu of file '%s'. value is empty. Ignoring it.",
+ line, filename);
continue;
}
@@ -653,7 +658,7 @@ static int health_readfile(const char *filename, void *data) {
{
char *tmp = strdupz(value);
if(rrdvar_fix_name(tmp))
- error("Health configuration renamed alarm '%s' to '%s'", value, tmp);
+ netdata_log_error("Health configuration renamed alarm '%s' to '%s'", value, tmp);
rc->name = string_strdupz(tmp);
freez(tmp);
@@ -673,6 +678,7 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg = callocz(1, sizeof(struct alert_config));
alert_cfg->alarm = string_dup(rc->name);
+ alert_cfg->source = health_source_file(line, filename);
ignore_this = 0;
} else {
rc = NULL;
@@ -702,7 +708,7 @@ static int health_readfile(const char *filename, void *data) {
{
char *tmp = strdupz(value);
if(rrdvar_fix_name(tmp))
- error("Health configuration renamed template '%s' to '%s'", value, tmp);
+ netdata_log_error("Health configuration renamed template '%s' to '%s'", value, tmp);
rt->name = string_strdupz(tmp);
freez(tmp);
@@ -719,6 +725,7 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg = callocz(1, sizeof(struct alert_config));
alert_cfg->template_key = string_dup(rt->name);
+ alert_cfg->source = health_source_file(line, filename);
ignore_this = 0;
} else {
rt = NULL;
@@ -731,10 +738,10 @@ static int health_readfile(const char *filename, void *data) {
if(!simple_pattern_matches_string(os_pattern, host->os)) {
if(rc)
- debug(D_HEALTH, "HEALTH on '%s' ignoring alarm '%s' defined at %zu@%s: host O/S does not match '%s'", rrdhost_hostname(host), rrdcalc_name(rc), line, filename, os_match);
+ netdata_log_debug(D_HEALTH, "HEALTH on '%s' ignoring alarm '%s' defined at %zu@%s: host O/S does not match '%s'", rrdhost_hostname(host), rrdcalc_name(rc), line, filename, os_match);
if(rt)
- debug(D_HEALTH, "HEALTH on '%s' ignoring template '%s' defined at %zu@%s: host O/S does not match '%s'", rrdhost_hostname(host), rrdcalctemplate_name(rt), line, filename, os_match);
+ netdata_log_debug(D_HEALTH, "HEALTH on '%s' ignoring template '%s' defined at %zu@%s: host O/S does not match '%s'", rrdhost_hostname(host), rrdcalctemplate_name(rt), line, filename, os_match);
ignore_this = 1;
}
@@ -748,10 +755,10 @@ static int health_readfile(const char *filename, void *data) {
if(!simple_pattern_matches_string(host_pattern, host->hostname)) {
if(rc)
- debug(D_HEALTH, "HEALTH on '%s' ignoring alarm '%s' defined at %zu@%s: hostname does not match '%s'", rrdhost_hostname(host), rrdcalc_name(rc), line, filename, host_match);
+ netdata_log_debug(D_HEALTH, "HEALTH on '%s' ignoring alarm '%s' defined at %zu@%s: hostname does not match '%s'", rrdhost_hostname(host), rrdcalc_name(rc), line, filename, host_match);
if(rt)
- debug(D_HEALTH, "HEALTH on '%s' ignoring template '%s' defined at %zu@%s: hostname does not match '%s'", rrdhost_hostname(host), rrdcalctemplate_name(rt), line, filename, host_match);
+ netdata_log_debug(D_HEALTH, "HEALTH on '%s' ignoring template '%s' defined at %zu@%s: hostname does not match '%s'", rrdhost_hostname(host), rrdcalctemplate_name(rt), line, filename, host_match);
ignore_this = 1;
}
@@ -763,8 +770,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->on = string_strdupz(value);
if(rc->chart) {
if(strcmp(rrdcalc_chart_name(rc), value) != 0)
- error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rrdcalc_name(rc), key, rrdcalc_chart_name(rc), value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, filename, rrdcalc_name(rc), key, rrdcalc_chart_name(rc), value, value);
string_freez(rc->chart);
}
@@ -776,8 +783,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->classification = string_strdupz(value);
if(rc->classification) {
if(strcmp(rrdcalc_classification(rc), value) != 0)
- error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rrdcalc_name(rc), key, rrdcalc_classification(rc), value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, filename, rrdcalc_name(rc), key, rrdcalc_classification(rc), value, value);
string_freez(rc->classification);
}
@@ -789,7 +796,7 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->component = string_strdupz(value);
if(rc->component) {
if(strcmp(rrdcalc_component(rc), value) != 0)
- error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
line, filename, rrdcalc_name(rc), key, rrdcalc_component(rc), value, value);
string_freez(rc->component);
@@ -802,8 +809,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->type = string_strdupz(value);
if(rc->type) {
if(strcmp(rrdcalc_type(rc), value) != 0)
- error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rrdcalc_name(rc), key, rrdcalc_type(rc), value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, filename, rrdcalc_name(rc), key, rrdcalc_type(rc), value, value);
string_freez(rc->type);
}
@@ -831,8 +838,8 @@ static int health_readfile(const char *filename, void *data) {
else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
alert_cfg->every = string_strdupz(value);
if(!config_parse_duration(value, &rc->update_every))
- error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
- line, filename, rrdcalc_name(rc), key, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
+ line, filename, rrdcalc_name(rc), key, value);
alert_cfg->p_update_every = rc->update_every;
}
else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
@@ -840,8 +847,8 @@ static int health_readfile(const char *filename, void *data) {
char *e;
rc->green = str2ndd(value, &e);
if(e && *e) {
- error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
- line, filename, rrdcalc_name(rc), key, e);
+ netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
+ line, filename, rrdcalc_name(rc), key, e);
}
}
else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
@@ -849,8 +856,8 @@ static int health_readfile(const char *filename, void *data) {
char *e;
rc->red = str2ndd(value, &e);
if(e && *e) {
- error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
- line, filename, rrdcalc_name(rc), key, e);
+ netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
+ line, filename, rrdcalc_name(rc), key, e);
}
}
else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
@@ -859,8 +866,8 @@ static int health_readfile(const char *filename, void *data) {
int error = 0;
rc->calculation = expression_parse(value, &failed_at, &error);
if(!rc->calculation) {
- error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
- line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at);
+ netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
+ line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at);
}
parse_variables_and_store_in_health_rrdvars(value, HEALTH_CONF_MAX_LINE);
}
@@ -870,8 +877,8 @@ static int health_readfile(const char *filename, void *data) {
int error = 0;
rc->warning = expression_parse(value, &failed_at, &error);
if(!rc->warning) {
- error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
- line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at);
+ netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
+ line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at);
}
parse_variables_and_store_in_health_rrdvars(value, HEALTH_CONF_MAX_LINE);
}
@@ -881,8 +888,8 @@ static int health_readfile(const char *filename, void *data) {
int error = 0;
rc->critical = expression_parse(value, &failed_at, &error);
if(!rc->critical) {
- error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
- line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at);
+ netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
+ line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at);
}
parse_variables_and_store_in_health_rrdvars(value, HEALTH_CONF_MAX_LINE);
}
@@ -890,8 +897,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->exec = string_strdupz(value);
if(rc->exec) {
if(strcmp(rrdcalc_exec(rc), value) != 0)
- error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rrdcalc_name(rc), key, rrdcalc_exec(rc), value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, filename, rrdcalc_name(rc), key, rrdcalc_exec(rc), value, value);
string_freez(rc->exec);
}
@@ -901,8 +908,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->to = string_strdupz(value);
if(rc->recipient) {
if(strcmp(rrdcalc_recipient(rc), value) != 0)
- error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rrdcalc_name(rc), key, rrdcalc_recipient(rc), value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, filename, rrdcalc_name(rc), key, rrdcalc_recipient(rc), value, value);
string_freez(rc->recipient);
}
@@ -914,8 +921,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->units = string_strdupz(value);
if(rc->units) {
if(strcmp(rrdcalc_units(rc), value) != 0)
- error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rrdcalc_name(rc), key, rrdcalc_units(rc), value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, filename, rrdcalc_name(rc), key, rrdcalc_units(rc), value, value);
string_freez(rc->units);
}
@@ -927,8 +934,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->info = string_strdupz(value);
if(rc->info) {
if(strcmp(rrdcalc_info(rc), value) != 0)
- error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rrdcalc_name(rc), key, rrdcalc_info(rc), value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, filename, rrdcalc_name(rc), key, rrdcalc_info(rc), value, value);
string_freez(rc->info);
string_freez(rc->original_info);
@@ -954,8 +961,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->host_labels = string_strdupz(value);
if(rc->host_labels) {
if(strcmp(rrdcalc_host_labels(rc), value) != 0)
- error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'.",
- line, filename, rrdcalc_name(rc), key, value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'.",
+ line, filename, rrdcalc_name(rc), key, rrdcalc_host_labels(rc), value);
string_freez(rc->host_labels);
simple_pattern_free(rc->host_labels_pattern);
@@ -989,8 +996,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->chart_labels = string_strdupz(value);
if(rc->chart_labels) {
if(strcmp(rrdcalc_chart_labels(rc), value) != 0)
- error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'.",
- line, filename, rrdcalc_name(rc), key, value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'.",
+ line, filename, rrdcalc_name(rc), key, rrdcalc_chart_labels(rc), value);
string_freez(rc->chart_labels);
simple_pattern_free(rc->chart_labels_pattern);
@@ -1007,8 +1014,8 @@ static int health_readfile(const char *filename, void *data) {
true);
}
else {
- error("Health configuration at line %zu of file '%s' for alarm '%s' has unknown key '%s'.",
- line, filename, rrdcalc_name(rc), key);
+ netdata_log_error("Health configuration at line %zu of file '%s' for alarm '%s' has unknown key '%s'.",
+ line, filename, rrdcalc_name(rc), key);
}
}
else if(rt) {
@@ -1016,8 +1023,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->on = string_strdupz(value);
if(rt->context) {
if(strcmp(string2str(rt->context), value) != 0)
- error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rrdcalctemplate_name(rt), key, string2str(rt->context), value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, filename, rrdcalctemplate_name(rt), key, string2str(rt->context), value, value);
string_freez(rt->context);
}
@@ -1029,8 +1036,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->classification = string_strdupz(value);
if(rt->classification) {
if(strcmp(rrdcalctemplate_classification(rt), value) != 0)
- error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_classification(rt), value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_classification(rt), value, value);
string_freez(rt->classification);
}
@@ -1042,8 +1049,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->component = string_strdupz(value);
if(rt->component) {
if(strcmp(rrdcalctemplate_component(rt), value) != 0)
- error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_component(rt), value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_component(rt), value, value);
string_freez(rt->component);
}
@@ -1055,8 +1062,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->type = string_strdupz(value);
if(rt->type) {
if(strcmp(rrdcalctemplate_type(rt), value) != 0)
- error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_type(rt), value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_type(rt), value, value);
string_freez(rt->type);
}
@@ -1122,8 +1129,8 @@ static int health_readfile(const char *filename, void *data) {
else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
alert_cfg->every = string_strdupz(value);
if(!config_parse_duration(value, &rt->update_every))
- error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
- line, filename, rrdcalctemplate_name(rt), key, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
+ line, filename, rrdcalctemplate_name(rt), key, value);
alert_cfg->p_update_every = rt->update_every;
}
else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
@@ -1131,8 +1138,8 @@ static int health_readfile(const char *filename, void *data) {
char *e;
rt->green = str2ndd(value, &e);
if(e && *e) {
- error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
- line, filename, rrdcalctemplate_name(rt), key, e);
+ netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
+ line, filename, rrdcalctemplate_name(rt), key, e);
}
}
else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
@@ -1140,8 +1147,8 @@ static int health_readfile(const char *filename, void *data) {
char *e;
rt->red = str2ndd(value, &e);
if(e && *e) {
- error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
- line, filename, rrdcalctemplate_name(rt), key, e);
+ netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
+ line, filename, rrdcalctemplate_name(rt), key, e);
}
}
else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
@@ -1150,8 +1157,8 @@ static int health_readfile(const char *filename, void *data) {
int error = 0;
rt->calculation = expression_parse(value, &failed_at, &error);
if(!rt->calculation) {
- error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
- line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at);
+ netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
+ line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at);
}
parse_variables_and_store_in_health_rrdvars(value, HEALTH_CONF_MAX_LINE);
}
@@ -1161,8 +1168,8 @@ static int health_readfile(const char *filename, void *data) {
int error = 0;
rt->warning = expression_parse(value, &failed_at, &error);
if(!rt->warning) {
- error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
- line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at);
+ netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
+ line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at);
}
parse_variables_and_store_in_health_rrdvars(value, HEALTH_CONF_MAX_LINE);
}
@@ -1172,8 +1179,8 @@ static int health_readfile(const char *filename, void *data) {
int error = 0;
rt->critical = expression_parse(value, &failed_at, &error);
if(!rt->critical) {
- error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
- line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at);
+ netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
+ line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at);
}
parse_variables_and_store_in_health_rrdvars(value, HEALTH_CONF_MAX_LINE);
}
@@ -1181,8 +1188,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->exec = string_strdupz(value);
if(rt->exec) {
if(strcmp(rrdcalctemplate_exec(rt), value) != 0)
- error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_exec(rt), value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_exec(rt), value, value);
string_freez(rt->exec);
}
@@ -1192,8 +1199,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->to = string_strdupz(value);
if(rt->recipient) {
if(strcmp(rrdcalctemplate_recipient(rt), value) != 0)
- error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_recipient(rt), value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_recipient(rt), value, value);
string_freez(rt->recipient);
}
@@ -1205,8 +1212,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->units = string_strdupz(value);
if(rt->units) {
if(strcmp(rrdcalctemplate_units(rt), value) != 0)
- error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_units(rt), value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_units(rt), value, value);
string_freez(rt->units);
}
@@ -1218,8 +1225,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->info = string_strdupz(value);
if(rt->info) {
if(strcmp(rrdcalctemplate_info(rt), value) != 0)
- error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_info(rt), value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_info(rt), value, value);
string_freez(rt->info);
}
@@ -1243,8 +1250,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->host_labels = string_strdupz(value);
if(rt->host_labels) {
if(strcmp(rrdcalctemplate_host_labels(rt), value) != 0)
- error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_host_labels(rt), value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_host_labels(rt), value, value);
string_freez(rt->host_labels);
simple_pattern_free(rt->host_labels_pattern);
@@ -1263,8 +1270,8 @@ static int health_readfile(const char *filename, void *data) {
alert_cfg->chart_labels = string_strdupz(value);
if(rt->chart_labels) {
if(strcmp(rrdcalctemplate_chart_labels(rt), value) != 0)
- error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_chart_labels(rt), value, value);
+ netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+ line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_chart_labels(rt), value, value);
string_freez(rt->chart_labels);
simple_pattern_free(rt->chart_labels_pattern);
@@ -1281,13 +1288,13 @@ static int health_readfile(const char *filename, void *data) {
SIMPLE_PATTERN_EXACT, true);
}
else {
- error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.",
- line, filename, rrdcalctemplate_name(rt), key);
+ netdata_log_error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.",
+ line, filename, rrdcalctemplate_name(rt), key);
}
}
else {
- error("Health configuration at line %zu of file '%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
- line, filename, key);
+ netdata_log_error("Health configuration at line %zu of file '%s' has unknown key '%s'. Expected either '" HEALTH_ALARM_KEY "' or '" HEALTH_TEMPLATE_KEY "'.",
+ line, filename, key);
}
}
@@ -1321,7 +1328,7 @@ void sql_refresh_hashes(void)
void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath) {
if(unlikely((!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) ||
!service_running(SERVICE_HEALTH)) {
- debug(D_HEALTH, "CONFIG health is not enabled for host '%s'", rrdhost_hostname(host));
+ netdata_log_debug(D_HEALTH, "CONFIG health is not enabled for host '%s'", rrdhost_hostname(host));
return;
}
@@ -1329,7 +1336,7 @@ void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path
CONFIG_BOOLEAN_YES);
if (!stock_enabled) {
- log_health("[%s]: Netdata will not load stock alarms.", rrdhost_hostname(host));
+ netdata_log_health("[%s]: Netdata will not load stock alarms.", rrdhost_hostname(host));
stock_path = user_path;
}
@@ -1337,6 +1344,6 @@ void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path
health_rrdvars = health_rrdvariables_create();
recursive_config_double_dir_load(user_path, stock_path, subpath, health_readfile, (void *) host, 0);
- log_health("[%s]: Read health configuration.", rrdhost_hostname(host));
+ netdata_log_health("[%s]: Read health configuration.", rrdhost_hostname(host));
sql_store_hashes = 0;
}
diff --git a/health/health_json.c b/health/health_json.c
index 4f81998f0..1da0f5972 100644
--- a/health/health_json.c
+++ b/health/health_json.c
@@ -167,10 +167,6 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
buffer_strcat(wb, "\t\t}");
}
-//void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
-//
-//}
-
void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* contexts, RRDCALC_STATUS status) {
RRDCALC *rc;
int numberOfAlarms = 0;
diff --git a/health/health_log.c b/health/health_log.c
index b62e0ace4..4cfbee608 100644
--- a/health/health_log.c
+++ b/health/health_log.c
@@ -35,9 +35,9 @@ inline ALARM_ENTRY* health_create_alarm_entry(
STRING *units,
STRING *info,
int delay,
- uint32_t flags
+ HEALTH_ENTRY_FLAGS flags
) {
- debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
+ netdata_log_debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
ae->name = string_dup(name);
@@ -47,6 +47,7 @@ inline ALARM_ENTRY* health_create_alarm_entry(
uuid_copy(ae->config_hash_id, *((uuid_t *) config_hash_id));
uuid_generate_random(ae->transition_id);
+ ae->global_id = now_realtime_usec();
ae->family = string_dup(family);
ae->classification = string_dup(class);
@@ -88,19 +89,19 @@ inline void health_alarm_log_add_entry(
RRDHOST *host,
ALARM_ENTRY *ae
) {
- debug(D_HEALTH, "Health adding alarm log entry with id: %u", ae->unique_id);
+ netdata_log_debug(D_HEALTH, "Health adding alarm log entry with id: %u", ae->unique_id);
__atomic_add_fetch(&host->health_transitions, 1, __ATOMIC_RELAXED);
// link it
- netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_write_lock(&host->health_log.spinlock);
ae->next = host->health_log.alarms;
host->health_log.alarms = ae;
host->health_log.count++;
- netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_write_unlock(&host->health_log.spinlock);
// match previous alarms
- netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_read_lock(&host->health_log.spinlock);
ALARM_ENTRY *t;
for(t = host->health_log.alarms ; t ; t = t->next) {
if(t != ae && t->alarm_id == ae->alarm_id) {
@@ -120,7 +121,7 @@ inline void health_alarm_log_add_entry(
break;
}
}
- netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_read_unlock(&host->health_log.spinlock);
health_alarm_log_save(host, ae);
}
@@ -144,7 +145,7 @@ inline void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae) {
}
inline void health_alarm_log_free(RRDHOST *host) {
- netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_write_lock(&host->health_log.spinlock);
ALARM_ENTRY *ae;
while((ae = host->health_log.alarms)) {
@@ -152,5 +153,5 @@ inline void health_alarm_log_free(RRDHOST *host) {
health_alarm_log_free_one_nochecks_nounlink(ae);
}
- netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+ rw_spinlock_write_unlock(&host->health_log.spinlock);
}
diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in
index 51c000218..3cff33db9 100755
--- a/health/notifications/alarm-notify.sh.in
+++ b/health/notifications/alarm-notify.sh.in
@@ -3,7 +3,7 @@
# netdata
# real-time performance and health monitoring, done right!
-# (C) 2017 Costa Tsaousis <costa@tsaousis.gr>
+# (C) 2023 Netdata Inc.
# SPDX-License-Identifier: GPL-3.0-or-later
#
# Script to send alarm notifications for netdata
@@ -246,7 +246,8 @@ else
total_crit_alarms="${26}" # List of alarms in critical state
classification="${27}" # The class field from .conf files
edit_command_line="${28}" # The command to edit the alarm, with the line number
- child_machine_guid="${29}" # If populated, the notification is sent for a child
+ child_machine_guid="${29}" # the machine_guid of the child
+ transition_id="${30}" # the transition_id of the alert
fi
# -----------------------------------------------------------------------------
@@ -768,6 +769,15 @@ if [ "${SEND_AWSSNS}" = "YES" ] && [ -z "${aws}" ]; then
fi
fi
+# if we need nc, check for the nc command
+if [ "${SEND_IRC}" = "YES" ] && [ -z "${nc}" ]; then
+ nc="$(command -v nc 2>/dev/null)"
+ if [ -z "${nc}" ]; then
+ debug "Cannot find nc command in the system path. Disabling IRC notifications."
+ SEND_IRC="NO"
+ fi
+fi
+
if [ ${dump_methods} ]; then
for name in "${!SEND_@}"; do
if [ "${!name}" = "YES" ]; then
@@ -1913,7 +1923,7 @@ send_irc() {
SNDMESSAGE="${MESSAGE//$'\n'/", "}"
for CHANNEL in ${CHANNELS}; do
error=0
- send_alarm=$(echo -e "USER ${NICKNAME} guest ${REALNAME} ${SERVERNAME}\\nNICK ${NICKNAME}\\nJOIN ${CHANNEL}\\nPRIVMSG ${CHANNEL} :${SNDMESSAGE}\\nQUIT\\n" \ | nc "${NETWORK}" "${PORT}")
+ send_alarm=$(echo -e "USER ${NICKNAME} guest ${REALNAME} ${SERVERNAME}\\nNICK ${NICKNAME}\\nJOIN ${CHANNEL}\\nPRIVMSG ${CHANNEL} :${SNDMESSAGE}\\nQUIT\\n" \ | ${nc} "${NETWORK}" "${PORT}")
reply_codes=$(echo "${send_alarm}" | cut -d ' ' -f 2 | grep -o '[0-9]*')
for code in ${reply_codes}; do
if [ "${code}" -ge 400 ] && [ "${code}" -le 599 ]; then
@@ -2479,31 +2489,17 @@ urlencode "${value_string}" >/dev/null
url_value_string="${REPLY}"
redirect_params="host=${url_host}&chart=${url_chart}&family=${url_family}&alarm=${url_name}&alarm_unique_id=${unique_id}&alarm_id=${alarm_id}&alarm_event_id=${event_id}&alarm_when=${when}&alarm_status=${status}&alarm_chart=${chart}&alarm_value=${url_value_string}"
-GOTOCLOUD=0
-
-if [ "${NETDATA_REGISTRY_URL}" == "https://registry.my-netdata.io" ]; then
- if [ -z "${NETDATA_REGISTRY_UNIQUE_ID}" ]; then
- if [ -f "@registrydir_POST@/netdata.public.unique.id" ]; then
- NETDATA_REGISTRY_UNIQUE_ID="$(cat "@registrydir_POST@/netdata.public.unique.id")"
- fi
- fi
- if [ -n "${NETDATA_REGISTRY_UNIQUE_ID}" ]; then
- GOTOCLOUD=1
- fi
-fi
-if [ ${GOTOCLOUD} -eq 0 ]; then
- goto_url="${NETDATA_REGISTRY_URL}/goto-host-from-alarm.html?${redirect_params}"
-else
- # Temporarily disable alarm redirection, as the cloud endpoint no longer exists. This functionality will be restored after discussion on #9487. For now, just lead to netdata.cloud
- # Re-allow alarm redirection, for alarms 2.0, new template
- if [ -z "${child_machine_guid}" ]; then
- goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentId=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}"
+if [ -z "${NETDATA_REGISTRY_UNIQUE_ID}" ]; then
+ if [ -f "@registrydir_POST@/netdata.public.unique.id" ]; then
+ NETDATA_REGISTRY_UNIQUE_ID="$(cat "@registrydir_POST@/netdata.public.unique.id")"
else
- goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentId=${NETDATA_REGISTRY_UNIQUE_ID}&childId=${child_machine_guid}&${redirect_params}"
+ error "failed to identify this agent via its NETDATA_REGISTRY_UNIQUE_ID."
fi
fi
+goto_url="${NETDATA_REGISTRY_URL}/registry-alert-redirect.html?agent_machine_guid=${NETDATA_REGISTRY_UNIQUE_ID}&host_machine_guid=${child_machine_guid}&transition_id=${transition_id}&${redirect_params}"
+
# the severity of the alarm
severity="${status}"