summaryrefslogtreecommitdiffstats
path: root/health
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2022-11-30 18:47:00 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2022-11-30 18:47:00 +0000
commit03bf87dcb06f7021bfb2df2fa8691593c6148aff (patch)
treee16b06711a2ed77cafb4b7754be0220c3d14a9d7 /health
parentAdding upstream version 1.36.1. (diff)
downloadnetdata-upstream/1.37.0.tar.xz
netdata-upstream/1.37.0.zip
Adding upstream version 1.37.0.upstream/1.37.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health')
-rw-r--r--health/Makefile.am3
-rw-r--r--health/REFERENCE.md38
-rw-r--r--health/health.c1431
-rw-r--r--health/health.d/dns_query.conf17
-rw-r--r--health/health.d/go.d.plugin.conf2
-rw-r--r--health/health.d/ml.conf21
-rw-r--r--health/health.d/mysql.conf34
-rw-r--r--health/health.d/nvme.conf15
-rw-r--r--health/health.d/pihole.conf23
-rw-r--r--health/health.d/ping.conf50
-rw-r--r--health/health.d/postgres.conf214
-rw-r--r--health/health.d/python.d.plugin.conf2
-rw-r--r--health/health.d/redis.conf29
-rw-r--r--health/health.d/systemdunits.conf105
-rw-r--r--health/health.d/tcp_resets.conf4
-rw-r--r--health/health.d/timex.conf2
-rw-r--r--health/health.h116
-rw-r--r--health/health_config.c705
-rw-r--r--health/health_json.c157
-rw-r--r--health/health_log.c316
-rwxr-xr-xhealth/notifications/alarm-notify.sh.in60
-rwxr-xr-xhealth/notifications/health_alarm_notify.conf5
22 files changed, 1958 insertions, 1391 deletions
diff --git a/health/Makefile.am b/health/Makefile.am
index 777b35858..7c8d7f9d2 100644
--- a/health/Makefile.am
+++ b/health/Makefile.am
@@ -65,8 +65,11 @@ dist_healthconfig_DATA = \
health.d/mysql.conf \
health.d/net.conf \
health.d/netfilter.conf \
+ health.d/nvme.conf \
health.d/nut.conf \
health.d/pihole.conf \
+ health.d/ping.conf \
+ health.d/postgres.conf \
health.d/portcheck.conf \
health.d/processes.conf \
health.d/python.d.plugin.conf \
diff --git a/health/REFERENCE.md b/health/REFERENCE.md
index d1af74767..90da4102a 100644
--- a/health/REFERENCE.md
+++ b/health/REFERENCE.md
@@ -536,12 +536,48 @@ See our [simple patterns docs](/libnetdata/simple_pattern/README.md) for more ex
#### Alarm line `info`
-The info field can contain a small piece of text describing the alarm or template. This will be rendered in notifications and UI elements whenever the specific alarm is in focus. An example for the `ram_available` alarm is:
+The info field can contain a small piece of text describing the alarm or template. This will be rendered in
+notifications and UI elements whenever the specific alarm is in focus. An example for the `ram_available` alarm is:
```yaml
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
```
+Info fields can contain special variables in their text, which are replaced at run-time to provide more specific
+alert information. The currently supported variables are:
+
+| variable | description |
+| ---------| ----------- |
+| $family | Will be replaced by the family instance for the alert (e.g. eth0) |
+| $label: | Must be followed by a chart label name (e.g. `$label:target`); it is replaced by that chart label's value |
+
+For example, an info field like the following:
+
+```yaml
+info: average inbound utilization for the network interface $family over the last minute
+```
+
+Will be rendered on the alert acting on interface `eth0` as:
+
+```yaml
+info: average inbound utilization for the network interface eth0 over the last minute
+```
+
+An alert acting on a chart that has a chart label named e.g. `target`, with a value of `https://netdata.cloud/`,
+can be enriched as follows:
+
+```yaml
+info: average ratio of HTTP responses with unexpected status over the last 5 minutes for the site $label:target
+```
+
+Will become:
+
+```yaml
+info: average ratio of HTTP responses with unexpected status over the last 5 minutes for the site https://netdata.cloud/
+```
+
+> Please note that variable names are case sensitive.
+
## Expressions
Netdata has an internal [infix expression parser](/libnetdata/eval). This parses expressions and creates an internal
diff --git a/health/health.c b/health/health.c
index 9eb36a9c6..3784e0f31 100644
--- a/health/health.c
+++ b/health/health.c
@@ -2,11 +2,166 @@
#include "health.h"
+#define WORKER_HEALTH_JOB_RRD_LOCK 0
+#define WORKER_HEALTH_JOB_HOST_LOCK 1
+#define WORKER_HEALTH_JOB_DB_QUERY 2
+#define WORKER_HEALTH_JOB_CALC_EVAL 3
+#define WORKER_HEALTH_JOB_WARNING_EVAL 4
+#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
+#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
+#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
+#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET 8
+#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM 9
+
+#if WORKER_UTILIZATION_MAX_JOB_TYPES < 10
+#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 10
+#endif
+
+/*
+ * Sanitize one string argument with sanitize_command_argument_string() and,
+ * on success, append it to wb as " '<sanitized>'".
+ *
+ * Returns false, appending nothing for this argument, when the sanitizer
+ * rejects the input.
+ */
+static bool append_sanitized_argument(BUFFER *wb, const char *arg)
+{
+    char buf[8192];
+
+    // the sanitizer is offered one byte less than the buffer, exactly as the
+    // previously inlined code did (n = 8192 - 1)
+    if (!sanitize_command_argument_string(buf, arg, sizeof(buf) - 1))
+        return false;
+
+    buffer_sprintf(wb, " '%s'", buf);
+    return true;
+}
+
+/*
+ * Build the alarm notification command line in wb.
+ *
+ * The result is the word "exec" followed by every argument, in parameter
+ * order, each one preceded by a space and wrapped in single quotes. String
+ * arguments go through sanitize_command_argument_string() first; numeric
+ * arguments are formatted directly.
+ *
+ * Returns true when all arguments were appended. Returns false as soon as a
+ * string argument fails sanitizing; wb then holds the partial command built
+ * so far and should not be executed.
+ *
+ * NOTE(review): `when` is 32 bits wide here, while the caller passes
+ * (unsigned long)ae->when - confirm the narrowing is acceptable.
+ */
+static bool prepare_command(BUFFER *wb,
+                            const char *exec,
+                            const char *recipient,
+                            const char *registry_hostname,
+                            uint32_t unique_id,
+                            uint32_t alarm_id,
+                            uint32_t alarm_event_id,
+                            uint32_t when,
+                            const char *alert_name,
+                            const char *alert_chart_name,
+                            const char *alert_family,
+                            const char *new_status,
+                            const char *old_status,
+                            NETDATA_DOUBLE new_value,
+                            NETDATA_DOUBLE old_value,
+                            const char *alert_source,
+                            uint32_t duration,
+                            uint32_t non_clear_duration,
+                            const char *alert_units,
+                            const char *alert_info,
+                            const char *new_value_string,
+                            const char *old_value_string,
+                            const char *source,
+                            const char *error_msg,
+                            int n_warn,
+                            int n_crit,
+                            const char *warn_alarms,
+                            const char *crit_alarms,
+                            const char *classification,
+                            const char *edit_command,
+                            const char *machine_guid)
+{
+    buffer_strcat(wb, "exec");
+
+    // script, recipient and host
+    if (!append_sanitized_argument(wb, exec) ||
+        !append_sanitized_argument(wb, recipient) ||
+        !append_sanitized_argument(wb, registry_hostname))
+        return false;
+
+    // event identifiers and timestamp
+    buffer_sprintf(wb, " '%u'", unique_id);
+    buffer_sprintf(wb, " '%u'", alarm_id);
+    buffer_sprintf(wb, " '%u'", alarm_event_id);
+    buffer_sprintf(wb, " '%u'", when);
+
+    // alert identity and status transition
+    if (!append_sanitized_argument(wb, alert_name) ||
+        !append_sanitized_argument(wb, alert_chart_name) ||
+        !append_sanitized_argument(wb, alert_family) ||
+        !append_sanitized_argument(wb, new_status) ||
+        !append_sanitized_argument(wb, old_status))
+        return false;
+
+    buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", new_value);
+    buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", old_value);
+
+    if (!append_sanitized_argument(wb, alert_source))
+        return false;
+
+    buffer_sprintf(wb, " '%u'", duration);
+    buffer_sprintf(wb, " '%u'", non_clear_duration);
+
+    // descriptive texts and expression details
+    if (!append_sanitized_argument(wb, alert_units) ||
+        !append_sanitized_argument(wb, alert_info) ||
+        !append_sanitized_argument(wb, new_value_string) ||
+        !append_sanitized_argument(wb, old_value_string) ||
+        !append_sanitized_argument(wb, source) ||
+        !append_sanitized_argument(wb, error_msg))
+        return false;
+
+    // counts of other active alerts
+    buffer_sprintf(wb, " '%d'", n_warn);
+    buffer_sprintf(wb, " '%d'", n_crit);
+
+    if (!append_sanitized_argument(wb, warn_alarms) ||
+        !append_sanitized_argument(wb, crit_alarms) ||
+        !append_sanitized_argument(wb, classification) ||
+        !append_sanitized_argument(wb, edit_command) ||
+        !append_sanitized_argument(wb, machine_guid))
+        return false;
+
+    return true;
+}
+
unsigned int default_health_enabled = 1;
char *silencers_filename;
// the queue of executed alarm notifications that haven't been waited for yet
-static struct {
+static __thread struct {
ALARM_ENTRY *head; // oldest
ALARM_ENTRY *tail; // latest
} alarm_notifications_in_progress = {NULL, NULL};
@@ -146,77 +301,51 @@ void health_init(void) {
* @param host the structure of the host that the function will reload the configuration.
*/
static void health_reload_host(RRDHOST *host) {
- if(unlikely(!host->health_enabled))
+ if(unlikely(!host->health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH))
return;
+ log_health("[%s]: Reloading health.", rrdhost_hostname(host));
+
char *user_path = health_user_config_dir();
char *stock_path = health_stock_config_dir();
// free all running alarms
- rrdhost_wrlock(host);
-
- while(host->templates)
- rrdcalctemplate_unlink_and_free(host, host->templates);
-
- RRDCALCTEMPLATE *rt,*next;
- for(rt = host->alarms_template_with_foreach; rt ; rt = next) {
- next = rt->next;
- rrdcalctemplate_free(rt);
- }
- host->alarms_template_with_foreach = NULL;
-
- while(host->alarms)
- rrdcalc_unlink_and_free(host, host->alarms);
-
- RRDCALC *rc,*nc;
- for(rc = host->alarms_with_foreach; rc ; rc = nc) {
- nc = rc->next;
- rrdcalc_free(rc);
- }
- host->alarms_with_foreach = NULL;
-
- rrdhost_unlock(host);
+ rrdcalc_delete_all(host);
+ rrdcalctemplate_delete_all(host);
// invalidate all previous entries in the alarm log
+ netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
ALARM_ENTRY *t;
for(t = host->health_log.alarms ; t ; t = t->next) {
if(t->new_status != RRDCALC_STATUS_REMOVED)
t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
}
+ netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
- rrdhost_rdlock(host);
// reset all thresholds to all charts
RRDSET *st;
rrdset_foreach_read(st, host) {
st->green = NAN;
st->red = NAN;
}
- rrdhost_unlock(host);
+ rrdset_foreach_done(st);
// load the new alarms
- rrdhost_wrlock(host);
health_readdir(host, user_path, stock_path, NULL);
//Discard alarms with labels that do not apply to host
- rrdcalc_labels_unlink_alarm_from_host(host);
+ rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
// link the loaded alarms to their charts
- RRDDIM *rd;
rrdset_foreach_write(st, host) {
if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
continue;
- rrdsetcalc_link_matching(st);
- rrdcalctemplate_link_matching(st);
- //This loop must be the last, because ` rrdcalctemplate_link_matching` will create alarms related to it.
- rrdset_rdlock(st);
- rrddim_foreach_read(rd, st) {
- rrdcalc_link_to_rrddim(rd, st, host);
- }
- rrdset_unlock(st);
+ rrdcalc_link_matching_alerts_to_rrdset(st);
+ rrdcalctemplate_link_matching_templates_to_rrdset(st);
}
-
- rrdhost_unlock(host);
+ rrdset_foreach_done(st);
+ host->aclk_alert_reloaded = 1;
}
/**
@@ -234,11 +363,6 @@ void health_reload(void) {
health_reload_host(host);
rrd_unlock();
-#ifdef ENABLE_ACLK
- if (netdata_cloud_setting) {
- aclk_alert_reloaded = 1;
- }
-#endif
}
// ----------------------------------------------------------------------------
@@ -250,7 +374,6 @@ static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) {
return RRDCALC_STATUS_CLEAR;
}
-#define ALARM_EXEC_COMMAND_LENGTH 8192
#define ACTIVE_ALARMS_LIST_EXAMINE 500
#define ACTIVE_ALARMS_LIST 15
@@ -266,13 +389,14 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
// do not send notifications for internal statuses
- debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
+ debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
// do not send notifications for disabled statuses
- debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
+ debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
// mark it as run, so that we will send the same alarm if it happens again
goto done;
}
@@ -292,7 +416,9 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// we have executed this alarm notification in the past
if(t && t->new_status == ae->new_status) {
// don't send the notification for the same status again
- debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name
+ debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae)
+ , rrdcalc_status2string(ae->new_status));
+ log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae)
, rrdcalc_status2string(ae->new_status));
goto done;
}
@@ -303,7 +429,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
- , ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
+ , ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
}
@@ -312,14 +438,14 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
// Check if alarm notifications are silenced
if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
- info("Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status));
+ log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
goto done;
}
- static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
+ log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
- const char *exec = (ae->exec) ? ae->exec : host->health_default_exec;
- const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient;
+ const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health_default_exec);
+ const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health_default_recipient);
int n_warn=0, n_crit=0;
RRDCALC *rc;
@@ -330,13 +456,16 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
- for(rc = host->alarms; rc && (n_warn + n_crit) < ACTIVE_ALARMS_LIST_EXAMINE ; rc = rc->next) {
+ foreach_rrdcalc_in_rrdhost_read(host, rc) {
if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
continue;
+ if(unlikely((n_warn + n_crit) >= ACTIVE_ALARMS_LIST_EXAMINE))
+ break;
+
if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
- active_alerts[n_warn+n_crit].name = rc->name;
+ active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
active_alerts[n_warn+n_crit].status = rc->status;
n_warn++;
@@ -344,7 +473,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
expr = rc->warning;
} else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
- active_alerts[n_warn+n_crit].name = rc->name;
+ active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
active_alerts[n_warn+n_crit].status = rc->status;
n_crit++;
@@ -355,6 +484,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
expr = rc->warning;
}
}
+ foreach_rrdcalc_in_rrdhost_done(rc);
if (n_warn+n_crit>1)
qsort (active_alerts, n_warn+n_crit, sizeof(active_alerts_t), compare_active_alerts);
@@ -379,51 +509,55 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
}
}
- char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN");
-
- snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" NETDATA_DOUBLE_FORMAT_ZERO
- "' '" NETDATA_DOUBLE_FORMAT_ZERO
- "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
- exec,
- recipient,
- host->registry_hostname,
- ae->unique_id,
- ae->alarm_id,
- ae->alarm_event_id,
- (unsigned long)ae->when,
- ae->name,
- ae->chart?ae->chart:"NOCHART",
- ae->family?ae->family:"NOFAMILY",
- rrdcalc_status2string(ae->new_status),
- rrdcalc_status2string(ae->old_status),
- ae->new_value,
- ae->old_value,
- ae->source?ae->source:"UNKNOWN",
- (uint32_t)ae->duration,
- (uint32_t)ae->non_clear_duration,
- ae->units?ae->units:"",
- ae->info?ae->info:"",
- ae->new_value_string,
- ae->old_value_string,
- (expr && expr->source)?expr->source:"NOSOURCE",
- (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
- n_warn,
- n_crit,
- buffer_tostring(warn_alarms),
- buffer_tostring(crit_alarms),
- ae->classification?ae->classification:"Unknown",
- edit_command,
- host != localhost ? host->machine_guid:""
- );
-
- ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
- ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */
-
- debug(D_HEALTH, "executing command '%s'", command_to_run);
- ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
- ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
- enqueue_alarm_notify_in_progress(ae);
+ char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN");
+
+ BUFFER *wb = buffer_create(8192);
+ bool ok = prepare_command(wb,
+ exec,
+ recipient,
+ rrdhost_registry_hostname(host),
+ ae->unique_id,
+ ae->alarm_id,
+ ae->alarm_event_id,
+ (unsigned long)ae->when,
+ ae_name(ae),
+ ae->chart?ae_chart_name(ae):"NOCHART",
+ ae->family?ae_family(ae):"NOFAMILY",
+ rrdcalc_status2string(ae->new_status),
+ rrdcalc_status2string(ae->old_status),
+ ae->new_value,
+ ae->old_value,
+ ae->source?ae_source(ae):"UNKNOWN",
+ (uint32_t)ae->duration,
+ (uint32_t)ae->non_clear_duration,
+ ae_units(ae),
+ ae_info(ae),
+ ae_new_value_string(ae),
+ ae_old_value_string(ae),
+ (expr && expr->source)?expr->source:"NOSOURCE",
+ (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
+ n_warn,
+ n_crit,
+ buffer_tostring(warn_alarms),
+ buffer_tostring(crit_alarms),
+ ae->classification?ae_classification(ae):"Unknown",
+ edit_command,
+ host != localhost ? host->machine_guid:"");
+
+ const char *command_to_run = buffer_tostring(wb);
+ if (ok) {
+ ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
+ ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */
+
+ debug(D_HEALTH, "executing command '%s'", command_to_run);
+ ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
+ ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
+ enqueue_alarm_notify_in_progress(ae);
+ } else {
+ error("Failed to format command arguments");
+ }
+ buffer_free(wb);
freez(edit_command);
buffer_free(warn_alarms);
buffer_free(crit_alarms);
@@ -450,7 +584,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
- ae->chart?ae->chart:"NOCHART", ae->name,
+ ae->chart?ae_chart_name(ae):"NOCHART", ae_name(ae),
ae->new_value,
rrdcalc_status2string(ae->old_status),
rrdcalc_status2string(ae->new_status)
@@ -467,7 +601,7 @@ static inline void health_alarm_log_process(RRDHOST *host) {
ALARM_ENTRY *ae;
for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
- if(likely(!alarm_entry_isrepeating(host, ae))) {
+ if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
if(unlikely(
!(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
!(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
@@ -481,13 +615,13 @@ static inline void health_alarm_log_process(RRDHOST *host) {
}
}
+ netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+
// remember this for the next iteration
host->health_last_processed_id = first_waiting;
bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max;
- netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
-
if (!cleanup_excess_log_entries)
return;
@@ -508,7 +642,7 @@ static inline void health_alarm_log_process(RRDHOST *host) {
ALARM_ENTRY *t = ae->next;
- if(likely(!alarm_entry_isrepeating(host, ae))) {
+ if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
health_alarm_wait_for_execution(ae);
health_alarm_log_free_one_nochecks_nounlink(ae);
host->health_log.count--;
@@ -522,7 +656,7 @@ static inline void health_alarm_log_process(RRDHOST *host) {
static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
if(unlikely(!rc->rrdset)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name);
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
@@ -533,40 +667,38 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
*next_run = rc->next_update;
}
- debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now));
+ debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now));
return 0;
}
if(unlikely(!rc->update_every)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name);
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rc->chart?rc->chart:"NOCHART", rc->name);
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name);
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
- debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name);
+ debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
return 0;
}
int update_every = rc->rrdset->update_every;
- rrdset_rdlock(rc->rrdset);
- time_t first = rrdset_first_entry_t_nolock(rc->rrdset);
- time_t last = rrdset_last_entry_t_nolock(rc->rrdset);
- rrdset_unlock(rc->rrdset);
+ time_t first = rrdset_first_entry_t(rc->rrdset);
+ time_t last = rrdset_last_entry_t(rc->rrdset);
if(unlikely(now + update_every < first /* || now - update_every > last */)) {
debug(D_HEALTH
, "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
- , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first
+ , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) now, (unsigned long) first
, (unsigned long) last);
return 0;
}
@@ -577,7 +709,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
if(needed + update_every < first || needed - update_every > last) {
debug(D_HEALTH
, "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
- , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first
+ , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) needed, (unsigned long) first
, (unsigned long) last);
return 0;
}
@@ -587,7 +719,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run)
}
static inline int check_if_resumed_from_suspension(void) {
- static usec_t last_realtime = 0, last_monotonic = 0;
+ static __thread usec_t last_realtime = 0, last_monotonic = 0;
usec_t realtime = now_realtime_usec(), monotonic = now_monotonic_usec();
int ret = 0;
@@ -603,41 +735,142 @@ static inline int check_if_resumed_from_suspension(void) {
return ret;
}
-static void health_main_cleanup(void *ptr) {
+static void health_thread_cleanup(void *ptr) {
worker_unregister();
- struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
- static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
+ struct health_state *h = ptr;
+ h->host->health_spawn = 0;
+
+ netdata_thread_cancel(netdata_thread_self());
+ log_health("[%s]: Health thread ended.", rrdhost_hostname(h->host));
+ debug(D_HEALTH, "HEALTH %s: Health thread ended.", rrdhost_hostname(h->host));
+}
+
+/*
+ * One-time health setup for a host.
+ *
+ * Does nothing when health is disabled for the host or when the host already
+ * carries RRDHOST_FLAG_INITIALIZED_HEALTH; otherwise sets that flag and:
+ *  - reads the default repeat intervals and the in-memory log size from the
+ *    health configuration section,
+ *  - seeds the alarm log counters and initializes the alarm log rwlock,
+ *  - creates the health directory under the host's varlib dir (and the varlib
+ *    dir itself for non-localhost hosts),
+ *  - sets the default notification script and recipient,
+ *  - loads the stored alarm log (legacy file and/or database),
+ *  - reads the health configuration and links alerts/templates to the charts,
+ *  - initializes the silencers.
+ *
+ * @param host          the host to initialize
+ * @param is_localhost  non-zero when host is the local host
+ */
+static void initialize_health(RRDHOST *host, int is_localhost) {
+    if(!host->health_enabled || rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) return;
+    rrdhost_flag_set(host, RRDHOST_FLAG_INITIALIZED_HEALTH);
+
+    log_health("[%s]: Initializing health.", rrdhost_hostname(host));
+
+    host->health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
+    host->health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
+
+    // log ids start from the current wall-clock time, alarm ids from zero
+    host->health_log.max = 1000;
+    host->health_log.next_log_id = (uint32_t)now_realtime_sec();
+    host->health_log.next_alarm_id = 0;
+
+    // values below 10 are rejected: the default is kept and written back to the configuration
+    long n = config_get_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", host->health_log.max);
+    if(n < 10) {
+        error("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max);
+        config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max);
+    }
+    else
+        host->health_log.max = (unsigned int)n;
+
+    netdata_rwlock_init(&host->health_log.alarm_log_rwlock);
+
+    char filename[FILENAME_MAX + 1];
+
+    if(!is_localhost) {
+        int r = mkdir(host->varlib_dir, 0775);
+        if (r != 0 && errno != EEXIST)
+            error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), host->varlib_dir);
+    }
+
+    {
+        snprintfz(filename, FILENAME_MAX, "%s/health", host->varlib_dir);
+        int r = mkdir(filename, 0775);
+        if(r != 0 && errno != EEXIST)
+            error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), filename);
+    }
+    snprintfz(filename, FILENAME_MAX, "%s/health/health-log.db", host->varlib_dir);
+    host->health_log_filename = strdupz(filename);
+
+    snprintfz(filename, FILENAME_MAX, "%s/alarm-notify.sh", netdata_configured_primary_plugins_dir);
+    host->health_default_exec = string_strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename));
+    host->health_default_recipient = string_strdupz("root");
+
+    if (!file_is_migrated(host->health_log_filename)) {
+        int rc = sql_create_health_log_table(host);
+        if (unlikely(rc)) {
+            // the table could not be created: keep using the legacy log file
+            log_health("[%s]: Failed to create health log table in the database", rrdhost_hostname(host));
+            health_alarm_log_load(host);
+            health_alarm_log_open(host);
+        }
+        else {
+            health_alarm_log_load(host);
+            add_migrated_file(host->health_log_filename, 0);
+        }
+    } else {
+        // TODO: This needs to go to the metadata thread
+        // Health should wait before accessing the table (needs to be created by the metadata thread)
+        sql_create_health_log_table(host);
+        sql_health_alarm_log_load(host);
+    }
+
+    // ------------------------------------------------------------------------
+    // load health configuration
+
+    health_readdir(host, health_user_config_dir(), health_stock_config_dir(), NULL);
- info("cleaning up...");
+    // link the loaded alarms to their charts
+    RRDSET *st;
+    rrdset_foreach_write(st, host) {
+        if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
+            continue;
+
+        rrdcalc_link_matching_alerts_to_rrdset(st);
+        rrdcalctemplate_link_matching_templates_to_rrdset(st);
+    }
+    rrdset_foreach_done(st);
- static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
+    //Discard alarms with labels that do not apply to host
+    rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
+
+    health_silencers_init();
+}
+
+/*
+ * Wait until next_run (wall-clock seconds) before the next health iteration.
+ *
+ * When next_run is already due, only a debug message is emitted. Otherwise
+ * the worker is marked idle and the function sleeps in one-second steps,
+ * returning early if health gets disabled for the host or netdata_exit is set.
+ *
+ * @param next_run  time of the next iteration
+ * @param loop      iteration counter, used only in debug messages
+ * @param host      host whose health_enabled flag is polled while waiting
+ */
+static void health_sleep(time_t next_run, unsigned int loop __maybe_unused, RRDHOST *host) {
+    time_t current = now_realtime_sec();
+
+    if(current >= next_run) {
+        debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
+        return;
+    }
+
+    worker_is_idle();
+    debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - current));
+
+    // re-check the exit conditions once per second instead of sleeping the whole interval
+    for( ; current < next_run && host->health_enabled && !netdata_exit ; current = now_realtime_sec())
+        sleep_usec(USEC_PER_SEC);
+}
-static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
+static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *silencers) {
SILENCER *s;
debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s",
- rc->name, (rc->rrdset)?rc->rrdset->context:"", rc->chart, host, (rc->rrdset)?rc->rrdset->family:"");
+ rrdcalc_name(rc), (rc->rrdset)?rrdset_context(rc->rrdset):"", rrdcalc_chart_name(rc), host, (rc->rrdset)?rrdset_family(rc->rrdset):"");
for (s = silencers->silencers; s!=NULL; s=s->next){
if (
- (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern,rc->name))) &&
- (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern,rc->rrdset->context))) &&
+ (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern, rrdcalc_name(rc)))) &&
+ (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern, rrdset_context(rc->rrdset)))) &&
(!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) &&
- (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern,rc->chart))) &&
- (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern,rc->rrdset->family)))
+ (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern, rrdcalc_chart_name(rc)))) &&
+ (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern, rrdset_family(rc->rrdset))))
) {
debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families);
if (unlikely(silencers->stype == STYPE_NONE)) {
- debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name);
+ debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc));
} else {
debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
, (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
- , rc->name
- , (rc->rrdset)?rc->rrdset->context:""
- , rc->chart
+ , rrdcalc_name(rc)
+ , (rc->rrdset)?rrdset_context(rc->rrdset):""
+ , rrdcalc_chart_name(rc)
, host
- , (rc->rrdset)?rc->rrdset->family:""
+ , (rc->rrdset)?rrdset_family(rc->rrdset):""
);
}
return silencers->stype;
@@ -657,66 +890,86 @@ static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers
* @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise
*/
static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
- uint32_t rrdcalc_flags_old = rc->rrdcalc_flags;
+ uint32_t rrdcalc_flags_old = rc->run_flags;
// Clear the flags
- rc->rrdcalc_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED);
+ rc->run_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED);
if (unlikely(silencers->all_alarms)) {
- if (silencers->stype == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
- else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
+ if (silencers->stype == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
+ else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
} else {
- SILENCE_TYPE st = check_silenced(rc, host->hostname, silencers);
- if (st == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED;
- else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED;
+ SILENCE_TYPE st = check_silenced(rc, rrdhost_hostname(host), silencers);
+ if (st == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
+ else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
}
- if (rrdcalc_flags_old != rc->rrdcalc_flags) {
+ if (rrdcalc_flags_old != rc->run_flags) {
info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
- host->hostname,
- rc->name,
+ rrdhost_hostname(host),
+ rrdcalc_name(rc),
(rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false",
- (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false",
+ (rc->run_flags & RRDCALC_FLAG_DISABLED)?"true":"false",
(rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false",
- (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
+ (rc->run_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
);
}
- if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)
+ if (rc->run_flags & RRDCALC_FLAG_DISABLED)
return 1;
else
return 0;
}
-// Create alarms for dimensions that have been added to charts
-// since the previous iteration.
-static void init_pending_foreach_alarms(RRDHOST *host) {
+static void health_execute_delayed_initializations(RRDHOST *host) {
RRDSET *st;
- RRDDIM *rd;
- if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS))
- return;
+ if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION)) return;
+ rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION);
- rrdhost_wrlock(host);
+ rrdset_foreach_reentrant(st, host) {
+ if(!rrdset_flag_check(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION)) continue;
+ rrdset_flag_clear(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION);
- rrdset_foreach_write(st, host) {
- if (!rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS))
- continue;
+ worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET);
+
+ if(!st->rrdfamily)
+ st->rrdfamily = rrdfamily_add_and_acquire(host, rrdset_family(st));
+
+ if(!st->rrdvars)
+ st->rrdvars = rrdvariables_create();
+
+ rrddimvar_index_init(st);
- rrdset_rdlock(st);
+ rrdsetvar_add_and_leave_released(st, "last_collected_t", RRDVAR_TYPE_TIME_T, &st->last_collected_time.tv_sec, RRDVAR_FLAG_NONE);
+ rrdsetvar_add_and_leave_released(st, "green", RRDVAR_TYPE_CALCULATED, &st->green, RRDVAR_FLAG_NONE);
+ rrdsetvar_add_and_leave_released(st, "red", RRDVAR_TYPE_CALCULATED, &st->red, RRDVAR_FLAG_NONE);
+ rrdsetvar_add_and_leave_released(st, "update_every", RRDVAR_TYPE_INT, &st->update_every, RRDVAR_FLAG_NONE);
+ rrdcalc_link_matching_alerts_to_rrdset(st);
+ rrdcalctemplate_link_matching_templates_to_rrdset(st);
+
+ RRDDIM *rd;
rrddim_foreach_read(rd, st) {
- if (!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM))
- continue;
+ if(!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION)) continue;
+ rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION);
- rrdcalc_link_to_rrddim(rd, st, host);
+ worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM);
- rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM);
- }
+ rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_CALCULATED, NULL, NULL, &rd->last_stored_value, RRDVAR_FLAG_NONE);
+ rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_COLLECTED, NULL, "_raw", &rd->last_collected_value, RRDVAR_FLAG_NONE);
+ rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_TIME_T, NULL, "_last_collected_t", &rd->last_collected_time.tv_sec, RRDVAR_FLAG_NONE);
- rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS);
- rrdset_unlock(st);
- }
+ RRDCALCTEMPLATE *rt;
+ foreach_rrdcalctemplate_read(host, rt) {
+ if(!rt->foreach_dimension_pattern)
+ continue;
- rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS);
- rrdhost_unlock(host);
+ if(rrdcalctemplate_check_rrdset_conditions(rt, st, host))
+ rrdcalctemplate_check_rrddim_conditions_and_link(rt, st, rd, host);
+ }
+ foreach_rrdcalctemplate_done(rt);
+ }
+ rrddim_foreach_done(rd);
+ }
+ rrdset_foreach_done(st);
}
/**
@@ -729,19 +982,6 @@ static void init_pending_foreach_alarms(RRDHOST *host) {
* @return It always returns NULL
*/
-#define WORKER_HEALTH_JOB_RRD_LOCK 0
-#define WORKER_HEALTH_JOB_HOST_LOCK 1
-#define WORKER_HEALTH_JOB_DB_QUERY 2
-#define WORKER_HEALTH_JOB_CALC_EVAL 3
-#define WORKER_HEALTH_JOB_WARNING_EVAL 4
-#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
-#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
-#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
-
-#if WORKER_UTILIZATION_MAX_JOB_TYPES < 8
-#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8
-#endif
-
void *health_main(void *ptr) {
worker_register("HEALTH");
worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock");
@@ -752,8 +992,14 @@ void *health_main(void *ptr) {
worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval");
worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry");
worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process");
+ worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET, "rrdset init");
+ worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM, "rrddim init");
- netdata_thread_cleanup_push(health_main_cleanup, ptr);
+ struct health_state *h = ptr;
+ netdata_thread_cleanup_push(health_thread_cleanup, ptr);
+
+ RRDHOST *host = h->host;
+ initialize_health(host, host == localhost);
int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
if(min_run_every < 1) min_run_every = 1;
@@ -763,16 +1009,21 @@ void *health_main(void *ptr) {
time_t now = now_realtime_sec();
time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
- rrdcalc_labels_unlink();
+ bool health_running_logged = false;
+
+ rrdhost_rdlock(host); //CHECK
+ rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
+ rrdhost_unlock(host);
unsigned int loop = 0;
#ifdef ENABLE_ACLK
unsigned int marked_aclk_reload_loop = 0;
#endif
- while(!netdata_exit) {
+ while(!netdata_exit && host->health_enabled) {
loop++;
debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
+ now = now_realtime_sec();
int runnable = 0, apply_hibernation_delay = 0;
time_t next_run = now + min_run_every;
RRDCALC *rc;
@@ -780,433 +1031,500 @@ void *health_main(void *ptr) {
if (unlikely(check_if_resumed_from_suspension())) {
apply_hibernation_delay = 1;
- info(
- "Postponing alarm checks for %"PRId64" seconds, "
- "because it seems that the system was just resumed from suspension.",
- (int64_t)hibernation_delay);
+ log_health(
+ "[%s]: Postponing alarm checks for %"PRId64" seconds, "
+ "because it seems that the system was just resumed from suspension.",
+ rrdhost_hostname(host),
+ (int64_t)hibernation_delay);
}
if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
- static int logged=0;
+ static __thread int logged=0;
if (!logged) {
- info("Skipping health checks, because all alarms are disabled via a %s command.",
- HEALTH_CMDAPI_CMD_DISABLEALL);
+ log_health("[%s]: Skipping health checks, because all alarms are disabled via a %s command.",
+ rrdhost_hostname(host),
+ HEALTH_CMDAPI_CMD_DISABLEALL);
logged = 1;
}
}
#ifdef ENABLE_ACLK
- if (aclk_alert_reloaded && !marked_aclk_reload_loop)
+ if (host->aclk_alert_reloaded && !marked_aclk_reload_loop)
marked_aclk_reload_loop = loop;
#endif
- worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK);
- rrd_rdlock();
+ if (unlikely(apply_hibernation_delay)) {
+ log_health(
+ "[%s]: Postponing health checks for %"PRId64" seconds.",
+ rrdhost_hostname(host),
+ (int64_t)hibernation_delay);
- RRDHOST *host;
- rrdhost_foreach_read(host) {
- if (unlikely(!host->health_enabled))
+ host->health_delay_up_to = now + hibernation_delay;
+ next_run = now + hibernation_delay;
+ health_sleep(next_run, loop, host);
+ }
+
+ if (unlikely(host->health_delay_up_to)) {
+ if (unlikely(now < host->health_delay_up_to)) {
+ next_run = host->health_delay_up_to;
+ health_sleep(next_run, loop, host);
continue;
+ }
- if (unlikely(apply_hibernation_delay)) {
- info(
- "Postponing health checks for %"PRId64" seconds, on host '%s'.",
- (int64_t)hibernation_delay,
- host->hostname);
+ log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host));
+ host->health_delay_up_to = 0;
+ }
- host->health_delay_up_to = now + hibernation_delay;
+ // wait until cleanup of obsolete charts on children is complete
+ if (host != localhost) {
+ if (unlikely(host->trigger_chart_obsoletion_check == 1)) {
+ log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host));
+ health_sleep(next_run, loop, host);
+ continue;
}
+ }
- if (unlikely(host->health_delay_up_to)) {
- if (unlikely(now < host->health_delay_up_to))
- continue;
+ if (!health_running_logged) {
+ log_health("[%s]: Health is running.", rrdhost_hostname(host));
+ health_running_logged = true;
+ }
- info("Resuming health checks on host '%s'.", host->hostname);
- host->health_delay_up_to = 0;
- }
+ if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
+ sql_health_alarm_log_cleanup(host);
- // wait until cleanup of obsolete charts on children is complete
- if (host != localhost)
- if (unlikely(host->trigger_chart_obsoletion_check == 1))
- continue;
+ health_execute_delayed_initializations(host);
- if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
- sql_health_alarm_log_cleanup(host);
+ worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
- init_pending_foreach_alarms(host);
+ // the first loop is to lookup values from the db
+ foreach_rrdcalc_in_rrdhost_read(host, rc) {
- worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
- rrdhost_rdlock(host);
+ rrdcalc_update_info_using_rrdset_labels(rc);
- // the first loop is to lookup values from the db
- for (rc = host->alarms; rc; rc = rc->next) {
+ if (update_disabled_silenced(host, rc))
+ continue;
- if (update_disabled_silenced(host, rc))
- continue;
+ // create an alert removed event if the chart is obsolete and
+ // has stopped being collected for 60 seconds
+ if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED &&
+ rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
+ now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
+ if (!rrdcalc_isrepeating(rc)) {
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
+ time_t now = now_realtime_sec();
+
+ ALARM_ENTRY *ae = health_create_alarm_entry(
+ host,
+ rc->id,
+ rc->next_event_id++,
+ rc->config_hash_id,
+ now,
+ rc->name,
+ rc->rrdset->id,
+ rc->rrdset->context,
+ rc->rrdset->family,
+ rc->classification,
+ rc->component,
+ rc->type,
+ rc->exec,
+ rc->recipient,
+ now - rc->last_status_change,
+ rc->value,
+ NAN,
+ rc->status,
+ RRDCALC_STATUS_REMOVED,
+ rc->source,
+ rc->units,
+ rc->info,
+ 0,
+ rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0);
+
+ if (ae) {
+ health_alarm_log_add_entry(host, ae);
+ rc->old_status = rc->status;
+ rc->status = RRDCALC_STATUS_REMOVED;
+ rc->last_status_change = now;
+ rc->last_updated = now;
+ rc->value = NAN;
- // create an alert removed event if the chart is obsolete and
- // has stopped being collected for 60 seconds
- if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED &&
- rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
- now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
- if (!rrdcalc_isrepeating(rc)) {
- worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
- time_t now = now_realtime_sec();
- ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
- rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
- rc->value, NAN, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0, 0);
- if (ae) {
- health_alarm_log(host, ae);
- rc->old_status = rc->status;
- rc->status = RRDCALC_STATUS_REMOVED;
- rc->last_status_change = now;
- rc->last_updated = now;
- rc->value = NAN;
#ifdef ENABLE_ACLK
- if (netdata_cloud_setting && likely(!aclk_alert_reloaded))
- sql_queue_alarm_to_aclk(host, ae, 1);
+ if (netdata_cloud_setting && likely(!host->aclk_alert_reloaded))
+ sql_queue_alarm_to_aclk(host, ae, 1);
#endif
- }
}
}
+ }
- if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
- if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE;
- continue;
- }
-
- runnable++;
- rc->old_value = rc->value;
- rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE;
-
- // ------------------------------------------------------------
- // if there is database lookup, do it
-
- if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
- worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY);
-
- /* time_t old_db_timestamp = rc->db_before; */
- int value_is_null = 0;
+ if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
+ if (unlikely(rc->run_flags & RRDCALC_FLAG_RUNNABLE))
+ rc->run_flags &= ~RRDCALC_FLAG_RUNNABLE;
+ continue;
+ }
- int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1,
- rc->after, rc->before, rc->group, NULL,
- 0, rc->options,
- &rc->db_after,&rc->db_before,
- NULL, NULL, NULL,
- &value_is_null, NULL, 0, 0);
+ runnable++;
+ rc->old_value = rc->value;
+ rc->run_flags |= RRDCALC_FLAG_RUNNABLE;
+
+ // ------------------------------------------------------------
+ // if there is database lookup, do it
+
+ if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
+ worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY);
+
+ /* time_t old_db_timestamp = rc->db_before; */
+ int value_is_null = 0;
+
+ int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rrdcalc_dimensions(rc), 1,
+ rc->after, rc->before, rc->group, NULL,
+ 0, rc->options,
+ &rc->db_after,&rc->db_before,
+ NULL, NULL, NULL,
+ &value_is_null, NULL, 0, 0,
+ QUERY_SOURCE_HEALTH);
+
+ if (unlikely(ret != 200)) {
+ // database lookup failed
+ rc->value = NAN;
+ rc->run_flags |= RRDCALC_FLAG_DB_ERROR;
+
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), ret
+ );
+ } else
+ rc->run_flags &= ~RRDCALC_FLAG_DB_ERROR;
+
+ /* - RRDCALC_FLAG_DB_STALE not currently used
+ if (unlikely(old_db_timestamp == rc->db_before)) {
+ // database is stale
+
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
+
+ if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
+ rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
+ error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
+ }
+ }
+ else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
+ rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
+ */
+
+ if (unlikely(value_is_null)) {
+ // collected value is null
+ rc->value = NAN;
+ rc->run_flags |= RRDCALC_FLAG_DB_NAN;
+
+ debug(D_HEALTH,
+ "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc)
+ );
+ } else
+ rc->run_flags &= ~RRDCALC_FLAG_DB_NAN;
+
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT,
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->value
+ );
+ }
- if (unlikely(ret != 200)) {
- // database lookup failed
- rc->value = NAN;
- rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR;
+ // ------------------------------------------------------------
+ // if there is calculation expression, run it
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, ret
- );
- } else
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR;
+ if (unlikely(rc->calculation)) {
+ worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
- /* - RRDCALC_FLAG_DB_STALE not currently used
- if (unlikely(old_db_timestamp == rc->db_before)) {
- // database is stale
+ if (unlikely(!expression_evaluate(rc->calculation))) {
+ // calculation failed
+ rc->value = NAN;
+ rc->run_flags |= RRDCALC_FLAG_CALC_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
+ rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
+ );
+ } else {
+ rc->run_flags &= ~RRDCALC_FLAG_CALC_ERROR;
- if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
- rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
- error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
- }
- }
- else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
- */
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
+ NETDATA_DOUBLE_FORMAT
+ ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
+ rc->calculation->parsed_as, rc->calculation->result,
+ buffer_tostring(rc->calculation->error_msg), rrdcalc_source(rc)
+ );
- if (unlikely(value_is_null)) {
- // collected value is null
- rc->value = NAN;
- rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN;
+ rc->value = rc->calculation->result;
+ }
+ }
+ }
+ foreach_rrdcalc_in_rrdhost_done(rc);
- debug(D_HEALTH,
- "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name
- );
- } else
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
+ if (unlikely(runnable && !netdata_exit)) {
+ foreach_rrdcalc_in_rrdhost_read(host, rc) {
+ if (unlikely(!(rc->run_flags & RRDCALC_FLAG_RUNNABLE)))
+ continue;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
- rc->value
- );
+ if (rc->run_flags & RRDCALC_FLAG_DISABLED) {
+ continue;
}
+ RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED;
+ RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED;
- // ------------------------------------------------------------
- // if there is calculation expression, run it
+ // --------------------------------------------------------
+ // check the warning expression
- if (unlikely(rc->calculation)) {
- worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
+ if (likely(rc->warning)) {
+ worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
- if (unlikely(!expression_evaluate(rc->calculation))) {
+ if (unlikely(!expression_evaluate(rc->warning))) {
// calculation failed
- rc->value = NAN;
- rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR;
+ rc->run_flags |= RRDCALC_FLAG_WARN_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
- rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
- );
+ debug(D_HEALTH,
+ "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
+ buffer_tostring(rc->warning->error_msg)
+ );
} else {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
-
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
+ rc->run_flags &= ~RRDCALC_FLAG_WARN_ERROR;
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
NETDATA_DOUBLE_FORMAT
- ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
- rc->calculation->parsed_as, rc->calculation->result,
- buffer_tostring(rc->calculation->error_msg), rc->source
- );
-
- rc->value = rc->calculation->result;
-
- if (rc->local) rc->local->last_updated = now;
- if (rc->family) rc->family->last_updated = now;
- if (rc->hostid) rc->hostid->last_updated = now;
- if (rc->hostname) rc->hostname->last_updated = now;
+ ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
+ rrdcalc_name(rc), rc->warning->result, buffer_tostring(rc->warning->error_msg), rrdcalc_source(rc)
+ );
+ warning_status = rrdcalc_value2status(rc->warning->result);
}
}
- }
-
- rrdhost_unlock(host);
-
- if (unlikely(runnable && !netdata_exit)) {
- rrdhost_rdlock(host);
- for (rc = host->alarms; rc; rc = rc->next) {
- if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)))
- continue;
+ // --------------------------------------------------------
+ // check the critical expression
- if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED) {
- continue;
- }
- RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED;
- RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED;
-
- // --------------------------------------------------------
- // check the warning expression
-
- if (likely(rc->warning)) {
- worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
-
- if (unlikely(!expression_evaluate(rc->warning))) {
- // calculation failed
- rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR;
-
- debug(D_HEALTH,
- "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
- buffer_tostring(rc->warning->error_msg)
- );
- } else {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
- NETDATA_DOUBLE_FORMAT
- ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
- rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source
- );
- warning_status = rrdcalc_value2status(rc->warning->result);
- }
- }
+ if (likely(rc->critical)) {
+ worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
- // --------------------------------------------------------
- // check the critical expression
-
- if (likely(rc->critical)) {
- worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
-
- if (unlikely(!expression_evaluate(rc->critical))) {
- // calculation failed
- rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR;
-
- debug(D_HEALTH,
- "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
- host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
- buffer_tostring(rc->critical->error_msg)
- );
- } else {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
- NETDATA_DOUBLE_FORMAT
- ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
- rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg),
- rc->source
- );
- critical_status = rrdcalc_value2status(rc->critical->result);
- }
- }
-
- // --------------------------------------------------------
- // decide the final alarm status
-
- RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED;
-
- switch (warning_status) {
- case RRDCALC_STATUS_CLEAR:
- status = RRDCALC_STATUS_CLEAR;
- break;
-
- case RRDCALC_STATUS_RAISED:
- status = RRDCALC_STATUS_WARNING;
- break;
+ if (unlikely(!expression_evaluate(rc->critical))) {
+ // calculation failed
+ rc->run_flags |= RRDCALC_FLAG_CRIT_ERROR;
- default:
- break;
+ debug(D_HEALTH,
+ "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
+ rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
+ buffer_tostring(rc->critical->error_msg)
+ );
+ } else {
+ rc->run_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
+ NETDATA_DOUBLE_FORMAT
+ ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
+ rrdcalc_name(rc), rc->critical->result, buffer_tostring(rc->critical->error_msg),
+ rrdcalc_source(rc)
+ );
+ critical_status = rrdcalc_value2status(rc->critical->result);
}
+ }
- switch (critical_status) {
- case RRDCALC_STATUS_CLEAR:
- if (status == RRDCALC_STATUS_UNDEFINED)
- status = RRDCALC_STATUS_CLEAR;
- break;
+ // --------------------------------------------------------
+ // decide the final alarm status
- case RRDCALC_STATUS_RAISED:
- status = RRDCALC_STATUS_CRITICAL;
- break;
+ RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED;
- default:
- break;
- }
+ switch (warning_status) {
+ case RRDCALC_STATUS_CLEAR:
+ status = RRDCALC_STATUS_CLEAR;
+ break;
- // --------------------------------------------------------
- // check if the new status and the old differ
+ case RRDCALC_STATUS_RAISED:
+ status = RRDCALC_STATUS_WARNING;
+ break;
- if (status != rc->status) {
- worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
- int delay = 0;
-
- // apply trigger hysteresis
+ default:
+ break;
+ }
- if (now > rc->delay_up_to_timestamp) {
- rc->delay_up_current = rc->delay_up_duration;
- rc->delay_down_current = rc->delay_down_duration;
- rc->delay_last = 0;
- rc->delay_up_to_timestamp = 0;
- } else {
- rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
- if (rc->delay_up_current > rc->delay_max_duration)
- rc->delay_up_current = rc->delay_max_duration;
+ switch (critical_status) {
+ case RRDCALC_STATUS_CLEAR:
+ if (status == RRDCALC_STATUS_UNDEFINED)
+ status = RRDCALC_STATUS_CLEAR;
+ break;
- rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
- if (rc->delay_down_current > rc->delay_max_duration)
- rc->delay_down_current = rc->delay_max_duration;
- }
+ case RRDCALC_STATUS_RAISED:
+ status = RRDCALC_STATUS_CRITICAL;
+ break;
- if (status > rc->status)
- delay = rc->delay_up_current;
- else
- delay = rc->delay_down_current;
+ default:
+ break;
+ }
- // COMMENTED: because we do need to send raising alarms
- // if(now + delay < rc->delay_up_to_timestamp)
- // delay = (int)(rc->delay_up_to_timestamp - now);
+ // --------------------------------------------------------
+ // check if the new status and the old differ
- rc->delay_last = delay;
- rc->delay_up_to_timestamp = now + delay;
+ if (status != rc->status) {
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
+ int delay = 0;
+ // apply trigger hysteresis
- ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
- rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
- rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
- rc->delay_last,
- (
- ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
- ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
- )
- );
- health_alarm_log(host, ae);
+ if (now > rc->delay_up_to_timestamp) {
+ rc->delay_up_current = rc->delay_up_duration;
+ rc->delay_down_current = rc->delay_down_duration;
+ rc->delay_last = 0;
+ rc->delay_up_to_timestamp = 0;
+ } else {
+ rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
+ if (rc->delay_up_current > rc->delay_max_duration)
+ rc->delay_up_current = rc->delay_max_duration;
- rc->last_status_change = now;
- rc->old_status = rc->status;
- rc->status = status;
+ rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
+ if (rc->delay_down_current > rc->delay_max_duration)
+ rc->delay_down_current = rc->delay_max_duration;
}
- rc->last_updated = now;
- rc->next_update = now + rc->update_every;
-
- if (next_run > rc->next_update)
- next_run = rc->next_update;
+ if (status > rc->status)
+ delay = rc->delay_up_current;
+ else
+ delay = rc->delay_down_current;
+
+ // COMMENTED: because we do need to send raising alarms
+ // if(now + delay < rc->delay_up_to_timestamp)
+ // delay = (int)(rc->delay_up_to_timestamp - now);
+
+ rc->delay_last = delay;
+ rc->delay_up_to_timestamp = now + delay;
+
+ ALARM_ENTRY *ae = health_create_alarm_entry(
+ host,
+ rc->id,
+ rc->next_event_id++,
+ rc->config_hash_id,
+ now,
+ rc->name,
+ rc->rrdset->id,
+ rc->rrdset->context,
+ rc->rrdset->family,
+ rc->classification,
+ rc->component,
+ rc->type,
+ rc->exec,
+ rc->recipient,
+ now - rc->last_status_change,
+ rc->old_value,
+ rc->value,
+ rc->status,
+ status,
+ rc->source,
+ rc->units,
+ rc->info,
+ rc->delay_last,
+ (
+ ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
+ ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
+ (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
+ )
+ );
+
+ health_alarm_log_add_entry(host, ae);
+
+ log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status));
+
+ rc->last_status_change = now;
+ rc->old_status = rc->status;
+ rc->status = status;
}
- // process repeating alarms
- RRDCALC *rc;
- for(rc = host->alarms; rc ; rc = rc->next) {
- int repeat_every = 0;
- if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
- if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
- repeat_every = rc->warn_repeat_every;
- } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
- rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE;
- repeat_every = rc->crit_repeat_every;
- } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
- if(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE)) {
- if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
- repeat_every = 1;
- } else if (rc->old_status == RRDCALC_STATUS_WARNING) {
- repeat_every = 1;
- }
+ rc->last_updated = now;
+ rc->next_update = now + rc->update_every;
+
+ if (next_run > rc->next_update)
+ next_run = rc->next_update;
+ }
+ foreach_rrdcalc_in_rrdhost_done(rc);
+
+ // process repeating alarms
+ foreach_rrdcalc_in_rrdhost_read(host, rc) {
+ int repeat_every = 0;
+ if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
+ if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
+ rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE;
+ repeat_every = rc->warn_repeat_every;
+ } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
+ rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE;
+ repeat_every = rc->crit_repeat_every;
+ } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
+ if(!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE)) {
+ if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
+ repeat_every = 1;
+ } else if (rc->old_status == RRDCALC_STATUS_WARNING) {
+ repeat_every = 1;
}
}
- } else {
- continue;
}
+ } else {
+ continue;
+ }
- if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
- worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
- rc->last_repeat = now;
- if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
- ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
- rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
- rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
- rc->delay_last,
- (
- ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
- ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
- )
- );
- ae->last_repeat = rc->last_repeat;
- if (!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
- ae->flags |= HEALTH_ENTRY_RUN_ONCE;
- }
- rc->rrdcalc_flags |= RRDCALC_FLAG_RUN_ONCE;
- health_process_notifications(host, ae);
- debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
- health_alarm_wait_for_execution(ae);
- health_alarm_log_free_one_nochecks_nounlink(ae);
+ if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
+ rc->last_repeat = now;
+ if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
+
+ ALARM_ENTRY *ae = health_create_alarm_entry(
+ host,
+ rc->id,
+ rc->next_event_id++,
+ rc->config_hash_id,
+ now,
+ rc->name,
+ rc->rrdset->id,
+ rc->rrdset->context,
+ rc->rrdset->family,
+ rc->classification,
+ rc->component,
+ rc->type,
+ rc->exec,
+ rc->recipient,
+ now - rc->last_status_change,
+ rc->old_value,
+ rc->value,
+ rc->old_status,
+ rc->status,
+ rc->source,
+ rc->units,
+ rc->info,
+ rc->delay_last,
+ (
+ ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
+ ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
+ (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
+ )
+ );
+
+ ae->last_repeat = rc->last_repeat;
+ if (!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
+ ae->flags |= HEALTH_ENTRY_RUN_ONCE;
}
+ rc->run_flags |= RRDCALC_FLAG_RUN_ONCE;
+ health_process_notifications(host, ae);
+ debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
+ health_alarm_wait_for_execution(ae);
+ health_alarm_log_free_one_nochecks_nounlink(ae);
}
-
- rrdhost_unlock(host);
}
+ foreach_rrdcalc_in_rrdhost_done(rc);
+ }
- if (unlikely(netdata_exit))
- break;
+ if (unlikely(netdata_exit))
+ break;
- // execute notifications
- // and cleanup
- worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS);
- health_alarm_log_process(host);
+ // execute notifications
+ // and cleanup
+ worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS);
+ health_alarm_log_process(host);
- if (unlikely(netdata_exit)) {
- // wait for all notifications to finish before allowing health to be cleaned up
- ALARM_ENTRY *ae;
- while (NULL != (ae = alarm_notifications_in_progress.head)) {
- health_alarm_wait_for_execution(ae);
- }
- break;
+ if (unlikely(netdata_exit)) {
+ // wait for all notifications to finish before allowing health to be cleaned up
+ ALARM_ENTRY *ae;
+ while (NULL != (ae = alarm_notifications_in_progress.head)) {
+ health_alarm_wait_for_execution(ae);
}
-
- } /* rrdhost_foreach */
+ break;
+ }
// wait for all notifications to finish before allowing health to be cleaned up
ALARM_ENTRY *ae;
@@ -1215,34 +1533,49 @@ void *health_main(void *ptr) {
}
#ifdef ENABLE_ACLK
- if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
- rrdhost_foreach_read(host) {
- if (unlikely(!host->health_enabled))
- continue;
- sql_queue_removed_alerts_to_aclk(host);
- }
- aclk_alert_reloaded = 0;
- marked_aclk_reload_loop = 0;
- }
+ if (netdata_cloud_setting && unlikely(host->aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
+ sql_queue_removed_alerts_to_aclk(host);
+ host->aclk_alert_reloaded = 0;
+ marked_aclk_reload_loop = 0;
+ }
#endif
- rrd_unlock();
-
if(unlikely(netdata_exit))
break;
- now = now_realtime_sec();
- if(now < next_run) {
- worker_is_idle();
- debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
- sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
- now = now_realtime_sec();
- }
- else
- debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
+ health_sleep(next_run, loop, host);
} // forever
netdata_thread_cleanup_pop(1);
return NULL;
}
+
+void health_add_host_labels(void) { // publishes two health config booleans as labels on localhost
+ DICTIONARY *labels = localhost->rrdlabels; // both labels below are added to localhost's label dictionary
+
+ int is_ephemeral = appconfig_get_boolean(&netdata_config, CONFIG_SECTION_HEALTH, "is ephemeral", CONFIG_BOOLEAN_NO); // CONFIG_BOOLEAN_NO is passed as the last argument - presumably the fallback when the option is unset (TODO confirm)
+ rrdlabels_add(labels, "_is_ephemeral", is_ephemeral ? "true" : "false", RRDLABEL_SRC_CONFIG); // stored as the string "true" or "false"
+
+ int has_unstable_connection = appconfig_get_boolean(&netdata_config, CONFIG_SECTION_HEALTH, "has unstable connection", CONFIG_BOOLEAN_NO); // same lookup pattern as "is ephemeral" above
+ rrdlabels_add(labels, "_has_unstable_connection", has_unstable_connection ? "true" : "false", RRDLABEL_SRC_CONFIG); // stored as the string "true" or "false"
+}
+
+/*
+ * Spawn the health monitoring thread for `host`.
+ *
+ * Does nothing when host->health_spawn is already set. Otherwise a
+ * struct health_state is allocated, pointed at the host and handed to
+ * health_main() as the thread argument.
+ *
+ * On success the host is marked as spawned and host->aclk_alert_reloaded
+ * is raised. On failure the error is logged and the state that was
+ * allocated for the thread is released, since no thread exists to use it.
+ */
+void health_thread_spawn(RRDHOST * host) {
+    if(!host->health_spawn) {
+        char tag[NETDATA_THREAD_TAG_MAX + 1];
+        snprintfz(tag, NETDATA_THREAD_TAG_MAX, "HEALTH[%s]", rrdhost_hostname(host));
+        struct health_state *health = callocz(1, sizeof(*health));
+        health->host = host;
+
+        if(netdata_thread_create(&host->health_thread, tag, NETDATA_THREAD_OPTION_JOINABLE, health_main, (void *) health)) {
+            log_health("[%s]: Failed to create new thread for client.", rrdhost_hostname(host));
+            error("HEALTH [%s]: Failed to create new thread for client.", rrdhost_hostname(host));
+            // the thread never started, so nothing else can release its argument
+            freez(health);
+        }
+        else {
+            log_health("[%s]: Created new thread for client.", rrdhost_hostname(host));
+            host->health_spawn = 1;
+            host->aclk_alert_reloaded = 1;
+        }
+    }
+}
diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf
index ec4937c0a..b9d6c2374 100644
--- a/health/health.d/dns_query.conf
+++ b/health/health.d/dns_query.conf
@@ -1,15 +1,14 @@
-
# detect dns query failure
- template: dns_query_time_query_time
- on: dns_query_time.query_time
- class: Latency
+ template: dns_query_query_status
+ on: dns_query.query_status
+ class: Errors
type: DNS
component: DNS
- lookup: average -10s unaligned foreach *
- units: ms
+ calc: $success
+ units: status
every: 10s
- warn: $this == nan
- delay: up 20s down 5m multiplier 1.5 max 1h
- info: average DNS query round trip time over the last 10 seconds
+ warn: $this != nan && $this != 1
+ delay: up 30s down 5m multiplier 1.5 max 1h
+ info: DNS request type $label:record_type to server $label:server is unsuccessful
to: sysadmin
diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf
index a84ab342f..cd87fe0e7 100644
--- a/health/health.d/go.d.plugin.conf
+++ b/health/health.d/go.d.plugin.conf
@@ -3,7 +3,7 @@
template: go.d_job_last_collected_secs
on: netdata.go_plugin_execution_time
- class: Error
+ class: Errors
type: Netdata
component: go.d.plugin
module: !* *
diff --git a/health/health.d/ml.conf b/health/health.d/ml.conf
index 9bcc81e76..6836ce7b1 100644
--- a/health/health.d/ml.conf
+++ b/health/health.d/ml.conf
@@ -1,10 +1,26 @@
# below are some examples of using the `anomaly-bit` option to define alerts based on anomaly
# rates as opposed to raw metric values. You can read more about the anomaly-bit and Netdata's
# native anomaly detection here:
-# https://learn.netdata.cloud/docs/configure/machine-learning#anomaly-bit---100--anomalous-0--normal
+# https://learn.netdata.cloud/docs/agent/ml#anomaly-bit---100--anomalous-0--normal
# examples below are commented, you would need to uncomment and adjust as desired to enable them.
+# node level anomaly rate example
+# https://learn.netdata.cloud/docs/agent/ml#node-anomaly-rate
+# if node level anomaly rate is between 1-5% then warning (pick your own threshold that works best via trial and error).
+# if node level anomaly rate is above 5% then critical (pick your own threshold that works best via trial and error).
+# template: ml_1min_node_ar
+# on: anomaly_detection.anomaly_rate
+# os: linux
+# hosts: *
+# lookup: average -1m foreach anomaly_rate
+# calc: $this
+# units: %
+# every: 30s
+# warn: $this > (($status >= $WARNING) ? (1) : (5))
+# crit: $this > (($status == $CRITICAL) ? (5) : (100))
+# info: rolling 1min node level anomaly rate
+
# alert per dimension example
# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error).
# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error).
@@ -33,4 +49,5 @@
# every: 30s
# warn: $this > (($status >= $WARNING) ? (5) : (20))
# crit: $this > (($status == $CRITICAL) ? (20) : (100))
-# info: rolling 5min anomaly rate for system.cpu chart \ No newline at end of file
+# info: rolling 5min anomaly rate for system.cpu chart
+
diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf
index 34452d983..3941c71cc 100644
--- a/health/health.d/mysql.conf
+++ b/health/health.d/mysql.conf
@@ -114,10 +114,10 @@ component: MySQL
class: Utilization
type: Database
component: MySQL
- lookup: max -2m absolute
+ lookup: max -2m at -1m unaligned
units: nodes
every: 10s
- info: maximum galera cluster size in the last 2 minutes
+ info: maximum galera cluster size in the last 2 minutes starting one minute ago
to: dba
template: mysql_galera_cluster_size
@@ -136,20 +136,29 @@ component: MySQL
# galera node state
- template: mysql_galera_cluster_state
+ template: mysql_galera_cluster_state_warn
on: mysql.galera_cluster_state
class: Errors
type: Database
component: MySQL
- calc: $state
+ calc: $donor + $joined
every: 10s
- warn: $this == 2 OR $this == 3
- crit: $this == 0 OR $this == 1 OR $this >= 5
+ warn: $this != nan AND $this != 0
delay: up 30s down 5m multiplier 1.5 max 1h
- info: galera node state \
- (0: Undefined, 1: Joining, 2: Donor/Desynced, 3: Joined, 4: Synced, 5: Inconsistent)
+ info: galera node state is either Donor/Desynced or Joined.
to: dba
+ template: mysql_galera_cluster_state_crit
+ on: mysql.galera_cluster_state
+ class: Errors
+ type: Database
+component: MySQL
+ calc: $undefined + $joining + $error
+ every: 10s
+ crit: $this != nan AND $this != 0
+ delay: up 30s down 5m multiplier 1.5 max 1h
+ info: galera node state is either Undefined or Joining or Error.
+ to: dba
# galera node status
@@ -158,11 +167,10 @@ component: MySQL
class: Errors
type: Database
component: MySQL
- calc: $wsrep_cluster_status
+ calc: $primary
every: 10s
- crit: $mysql_galera_cluster_state != nan AND $this != 0
+ crit: $this != nan AND $this != 1
delay: up 30s down 5m multiplier 1.5 max 1h
- info: galera node cluster component status \
- (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected). \
- Any other value than primary indicates that the node is part of a nonoperational component.
+ info: galera node is part of a nonoperational component. \
+ This occurs in cases of multiple membership changes that result in a loss of Quorum or in cases of split-brain situations.
to: dba
diff --git a/health/health.d/nvme.conf b/health/health.d/nvme.conf
new file mode 100644
index 000000000..5f729d52b
--- /dev/null
+++ b/health/health.d/nvme.conf
@@ -0,0 +1,15 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: nvme_device_critical_warnings_state
+ families: *
+ on: nvme.device_critical_warnings_state
+ class: Errors
+ type: System
+component: Disk
+ lookup: max -30s unaligned
+ units: state
+ every: 10s
+ crit: $this != nan AND $this != 0
+ delay: down 5m multiplier 1.5 max 2h
+ info: NVMe device $label:device has critical warnings
+ to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index 2e5c1cbfd..ee6c57cc5 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -15,21 +15,6 @@ component: Pi-hole
info: gravity.list (blocklist) file last update time
to: sysadmin
-# Gravity file check (gravity.list).
-
- template: pihole_blocklist_gravity_file
- on: pihole.blocklist_last_update
- class: Errors
- type: Ad Filtering
-component: Pi-hole
- every: 10s
- units: boolean
- calc: $file_exists
- crit: $this != 1
- delay: up 2m down 5m
- info: gravity.list (blocklist) file existence state (0: not-exists, 1: exists)
- to: sysadmin
-
# Pi-hole's ability to block unwanted domains.
# Should be enabled. The whole point of Pi-hole!
@@ -39,9 +24,9 @@ component: Pi-hole
type: Ad Filtering
component: Pi-hole
every: 10s
- units: boolean
- calc: $enabled
- warn: $this != 1
+ units: status
+ calc: $disabled
+ warn: $this != nan AND $this == 1
delay: up 2m down 5m
- info: unwanted domains blocking status (0: disabled, 1: enabled)
+ info: unwanted domains blocking is disabled
to: sysadmin
diff --git a/health/health.d/ping.conf b/health/health.d/ping.conf
new file mode 100644
index 000000000..cbe7c30c9
--- /dev/null
+++ b/health/health.d/ping.conf
@@ -0,0 +1,50 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: ping_host_reachable
+ families: *
+ on: ping.host_packet_loss
+ class: Errors
+ type: Other
+component: Network
+ lookup: average -30s unaligned of loss
+ calc: $this != nan AND $this < 100
+ units: up/down
+ every: 10s
+ crit: $this == 0
+ delay: down 30m multiplier 1.5 max 2h
+ info: network host $label:host reachability status
+ to: sysadmin
+
+ template: ping_packet_loss
+ families: *
+ on: ping.host_packet_loss
+ class: Errors
+ type: Other
+component: Network
+ lookup: average -10m unaligned of loss
+ green: 5
+ red: 10
+ units: %
+ every: 10s
+ warn: $this > $green
+ crit: $this > $red
+ delay: down 30m multiplier 1.5 max 2h
+ info: packet loss percentage to the network host $label:host over the last 10 minutes
+ to: sysadmin
+
+ template: ping_host_latency
+ families: *
+ on: ping.host_rtt
+ class: Latency
+ type: Other
+component: Network
+ lookup: average -10s unaligned of avg
+ units: ms
+ every: 10s
+ green: 500
+ red: 1000
+ warn: $this > $green OR $max > $red
+ crit: $this > $red
+ delay: down 30m multiplier 1.5 max 2h
+ info: average latency to the network host $label:host over the last 10 seconds
+ to: sysadmin
diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf
new file mode 100644
index 000000000..66d034cfe
--- /dev/null
+++ b/health/health.d/postgres.conf
@@ -0,0 +1,214 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: postgres_total_connection_utilization
+ on: postgres.connections_utilization
+ class: Utilization
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of used
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average total connection utilization over the last minute
+ to: dba
+
+ template: postgres_acquired_locks_utilization
+ on: postgres.locks_utilization
+ class: Utilization
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of used
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (15) : (20))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average acquired locks utilization over the last minute
+ to: dba
+
+ template: postgres_txid_exhaustion_perc
+ on: postgres.txid_exhaustion_perc
+ class: Utilization
+ type: Database
+component: PostgreSQL
+ hosts: *
+ calc: $txid_exhaustion
+ units: %
+ every: 1m
+ warn: $this > 90
+ delay: down 15m multiplier 1.5 max 1h
+ info: percent towards TXID wraparound
+ to: dba
+
+# Database alarms
+
+ template: postgres_db_cache_io_ratio
+ on: postgres.db_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average cache hit ratio in db $label:database over the last minute
+ to: dba
+
+ template: postgres_db_transactions_rollback_ratio
+ on: postgres.db_transactions_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -5m unaligned of rollback
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (2))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average aborted transactions percentage in db $label:database over the last five minutes
+ to: dba
+
+ template: postgres_db_deadlocks_rate
+ on: postgres.db_deadlocks_rate
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: sum -1m unaligned of deadlocks
+ units: deadlocks
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
+ delay: down 15m multiplier 1.5 max 1h
+ info: number of deadlocks detected in db $label:database in the last minute
+ to: dba
+
+# Table alarms
+
+ template: postgres_table_cache_io_ratio
+ on: postgres.table_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average cache hit ratio in db $label:database table $label:table over the last minute
+ to: dba
+
+ template: postgres_table_index_cache_io_ratio
+ on: postgres.table_index_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average index cache hit ratio in db $label:database table $label:table over the last minute
+ to: dba
+
+ template: postgres_table_toast_cache_io_ratio
+ on: postgres.table_toast_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average TOAST hit ratio in db $label:database table $label:table over the last minute
+ to: dba
+
+ template: postgres_table_toast_index_cache_io_ratio
+ on: postgres.table_toast_index_cache_io_ratio
+ class: Workload
+ type: Database
+component: PostgreSQL
+ hosts: *
+ lookup: average -1m unaligned of miss
+ calc: 100 - $this
+ units: %
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (70) : (60))
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average index TOAST hit ratio in db $label:database table $label:table over the last minute
+ to: dba
+
+ template: postgres_table_bloat_size_perc
+ on: postgres.table_bloat_size_perc
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: *
+ calc: $bloat
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (60) : (70))
+ crit: $this > (($status == $CRITICAL) ? (70) : (80))
+ delay: down 15m multiplier 1.5 max 1h
+ info: bloat size percentage in db $label:database table $label:table
+ to: dba
+
+ template: postgres_table_last_autovacuum_time
+ on: postgres.table_autovacuum_since_time
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: !*
+ calc: $time
+ units: seconds
+ every: 1m
+ warn: $this != nan AND $this > (60 * 60 * 24 * 7)
+ info: time elapsed since db $label:database table $label:table was vacuumed by the autovacuum daemon
+ to: dba
+
+ template: postgres_table_last_autoanalyze_time
+ on: postgres.table_autoanalyze_since_time
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: !*
+ calc: $time
+ units: seconds
+ every: 1m
+ warn: $this != nan AND $this > (60 * 60 * 24 * 7)
+ info: time elapsed since db $label:database table $label:table was analyzed by the autovacuum daemon
+ to: dba
+
+# Index alarms
+
+ template: postgres_index_bloat_size_perc
+ on: postgres.index_bloat_size_perc
+ class: Errors
+ type: Database
+component: PostgreSQL
+ hosts: *
+ calc: $bloat
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (60) : (70))
+ crit: $this > (($status == $CRITICAL) ? (70) : (80))
+ delay: down 15m multiplier 1.5 max 1h
+ info: bloat size percentage in db $label:database table $label:table index $label:index
+ to: dba
diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf
index e3b3d11cf..0e81a482f 100644
--- a/health/health.d/python.d.plugin.conf
+++ b/health/health.d/python.d.plugin.conf
@@ -3,7 +3,7 @@
template: python.d_job_last_collected_secs
on: netdata.pythond_runtime
- class: Error
+ class: Errors
type: Netdata
component: python.d.plugin
module: !* *
diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf
index cad5230c5..34d00b5df 100644
--- a/health/health.d/redis.conf
+++ b/health/health.d/redis.conf
@@ -1,3 +1,18 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: redis_connections_rejected
+ families: *
+ on: redis.connections
+ class: Errors
+ type: KV Storage
+component: Redis
+ lookup: sum -1m unaligned of rejected
+ every: 10s
+ units: connections
+ warn: $this > 0
+ info: connections rejected because of maxclients limit in the last minute
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
template: redis_bgsave_broken
families: *
@@ -26,3 +41,17 @@ component: Redis
info: duration of the on-going RDB save operation
delay: down 5m multiplier 1.5 max 1h
to: dba
+
+ template: redis_master_link_down
+ families: *
+ on: redis.master_link_down_since_time
+ class: Errors
+ type: KV Storage
+component: Redis
+ every: 10s
+ calc: $time
+ units: seconds
+ crit: $this != nan AND $this > 0
+ info: time elapsed since the link between master and slave is down
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf
index 38213a8db..531d62fac 100644
--- a/health/health.d/systemdunits.conf
+++ b/health/health.d/systemdunits.conf
@@ -1,142 +1,141 @@
-## Check if the are any systemd units in the failed state (crashed).
-## States: 1 - active, 2 - inactive, 3 - activating, 4 - deactivating, 5 - failed.
+# you can disable an alarm notification by setting the 'to' line to: silent
## Service units
- template: systemd_service_units_state
- on: systemd.service_units_state
+ template: systemd_service_unit_failed_state
+ on: systemd.service_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more systemd service units are in the failed state
+ info: systemd service unit in the failed state
to: sysadmin
## Socket units
- template: systemd_socket_units_state
+ template: systemd_socket_unit_failed_state
on: systemd.socket_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more systemd socket units are in the failed state
+ info: systemd socket unit in the failed state
to: sysadmin
## Target units
- template: systemd_target_units_state
+ template: systemd_target_unit_failed_state
on: systemd.target_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more systemd target units are in the failed state
+ info: systemd target unit in the failed state
to: sysadmin
## Path units
- template: systemd_path_units_state
+ template: systemd_path_unit_failed_state
on: systemd.path_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more systemd path units are in the failed state
+ info: systemd path unit in the failed state
to: sysadmin
## Device units
- template: systemd_device_units_state
+ template: systemd_device_unit_failed_state
on: systemd.device_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more the systemd device units are in the failed state
+ info: systemd device unit in the failed state
to: sysadmin
## Mount units
- template: systemd_mount_units_state
+ template: systemd_mount_unit_failed_state
on: systemd.mount_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more the systemd mount units are in the failed state
+ info: systemd mount units in the failed state
to: sysadmin
## Automount units
- template: systemd_automount_units_state
+ template: systemd_automount_unit_failed_state
on: systemd.automount_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more systemd automount units are in the failed state
+ info: systemd automount unit in the failed state
to: sysadmin
## Swap units
- template: systemd_swap_units_state
+ template: systemd_swap_unit_failed_state
on: systemd.swap_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more systemd swap units are in the failed state
+ info: systemd swap units in the failed state
to: sysadmin
## Scope units
- template: systemd_scope_units_state
+ template: systemd_scope_unit_failed_state
on: systemd.scope_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more systemd scope units are in the failed state
+ info: systemd scope units in the failed state
to: sysadmin
## Slice units
- template: systemd_slice_units_state
+ template: systemd_slice_unit_failed_state
on: systemd.slice_unit_state
class: Errors
type: Linux
component: Systemd units
- lookup: max -1s min2max
- units: ok/failed
+ calc: $failed
+ units: state
every: 10s
- warn: $this != nan AND $this == 5
+ warn: $this != nan AND $this == 1
delay: down 5m multiplier 1.5 max 1h
- info: one or more systemd slice units are in the failed state
+ info: systemd slice units in the failed state
to: sysadmin
diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
index 35cb6366c..ff116db64 100644
--- a/health/health.d/tcp_resets.conf
+++ b/health/health.d/tcp_resets.conf
@@ -26,7 +26,7 @@ component: Network
lookup: average -10s unaligned absolute of OutRsts
units: tcp resets/s
every: 10s
- warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (10)))
+ warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (10)))
delay: up 20s down 60m multiplier 1.2 max 2h
options: no-clear-notification
info: average number of sent TCP RESETS over the last 10 seconds. \
@@ -60,7 +60,7 @@ component: Network
lookup: average -10s unaligned absolute of AttemptFails
units: tcp resets/s
every: 10s
- warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10)))
+ warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10)))
delay: up 20s down 60m multiplier 1.2 max 2h
options: no-clear-notification
info: average number of received TCP RESETS over the last 10 seconds. \
diff --git a/health/health.d/timex.conf b/health/health.d/timex.conf
index 23c18ba10..2e9b1a3cf 100644
--- a/health/health.d/timex.conf
+++ b/health/health.d/timex.conf
@@ -5,7 +5,7 @@
alarm: system_clock_sync_state
on: system.clock_sync_state
os: linux
- class: Error
+ class: Errors
type: System
component: Clock
calc: $state
diff --git a/health/health.h b/health/health.h
index 3e77c12a7..15d8326ee 100644
--- a/health/health.h
+++ b/health/health.h
@@ -14,6 +14,7 @@ extern unsigned int default_health_enabled;
#define HEALTH_ENTRY_FLAG_SILENCED 0x00000010
#define HEALTH_ENTRY_RUN_ONCE 0x00000020
#define HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS 0x00000040
+#define HEALTH_ENTRY_FLAG_IS_REPEATING 0x00000080
#define HEALTH_ENTRY_FLAG_SAVED 0x10000000
#define HEALTH_ENTRY_FLAG_ACLK_QUEUED 0x20000000
@@ -31,65 +32,72 @@ extern unsigned int default_health_enabled;
extern char *silencers_filename;
-extern void health_init(void);
+void health_init(void);
-extern void health_reload(void);
+void health_reload(void);
-extern int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, NETDATA_DOUBLE *result);
-extern void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status);
-extern void health_alarms2json(RRDHOST *host, BUFFER *wb, int all);
-extern void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all);
-extern void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart);
+void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status);
+void health_alarms2json(RRDHOST *host, BUFFER *wb, int all);
+void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all);
+void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart);
void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf);
void health_api_v1_chart_custom_variables2json(RRDSET *st, BUFFER *buf);
-extern int health_alarm_log_open(RRDHOST *host);
-extern void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae);
-extern void health_alarm_log_load(RRDHOST *host);
-
-extern ALARM_ENTRY* health_create_alarm_entry(
- RRDHOST *host,
- uint32_t alarm_id,
- uint32_t alarm_event_id,
- uuid_t config_hash_id,
- time_t when,
- const char *name,
- const char *chart,
- const char *chart_context,
- const char *family,
- const char *classification,
- const char *component,
- const char *type,
- const char *exec,
- const char *recipient,
- time_t duration,
- NETDATA_DOUBLE old_value,
- NETDATA_DOUBLE new_value,
- RRDCALC_STATUS old_status,
- RRDCALC_STATUS new_status,
- const char *source,
- const char *units,
- const char *info,
- int delay,
- uint32_t flags);
-
-extern void health_alarm_log(RRDHOST *host, ALARM_ENTRY *ae);
-
-extern void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath);
-extern char *health_user_config_dir(void);
-extern char *health_stock_config_dir(void);
-extern void health_alarm_log_free(RRDHOST *host);
-
-extern void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae);
-
-extern void *health_cmdapi_thread(void *ptr);
-
-extern void health_label_log_save(RRDHOST *host);
-
-extern char *health_edit_command_from_source(const char *source);
-extern void sql_refresh_hashes(void);
-
-extern SIMPLE_PATTERN *health_pattern_from_foreach(char *s);
+int health_alarm_log_open(RRDHOST *host);
+void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae);
+void health_alarm_log_load(RRDHOST *host);
+
+void health_thread_spawn(RRDHOST *host);
+void health_thread_stop(RRDHOST *host);
+
+ALARM_ENTRY* health_create_alarm_entry(
+ RRDHOST *host,
+ uint32_t alarm_id,
+ uint32_t alarm_event_id,
+ const uuid_t config_hash_id,
+ time_t when,
+ STRING *name,
+ STRING *chart,
+ STRING *chart_context,
+ STRING *family,
+ STRING *classification,
+ STRING *component,
+ STRING *type,
+ STRING *exec,
+ STRING *recipient,
+ time_t duration,
+ NETDATA_DOUBLE old_value,
+ NETDATA_DOUBLE new_value,
+ RRDCALC_STATUS old_status,
+ RRDCALC_STATUS new_status,
+ STRING *source,
+ STRING *units,
+ STRING *info,
+ int delay,
+ uint32_t flags);
+
+void health_alarm_log_add_entry(RRDHOST *host, ALARM_ENTRY *ae);
+
+struct health_state { // per-host state; health_thread_spawn() allocates one and passes it to health_main() as the thread argument
+ RRDHOST *host; // the host this health thread is responsible for
+ netdata_thread_t thread; // NOTE(review): not set by health_thread_spawn(), which stores the thread in host->health_thread - confirm where this is used
+};
+
+void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath);
+char *health_user_config_dir(void);
+char *health_stock_config_dir(void);
+void health_alarm_log_free(RRDHOST *host);
+
+void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae);
+
+void *health_cmdapi_thread(void *ptr);
+
+void health_label_log_save(RRDHOST *host);
+
+char *health_edit_command_from_source(const char *source);
+void sql_refresh_hashes(void);
+
+void health_add_host_labels(void);
#endif //NETDATA_HEALTH_H
diff --git a/health/health_config.c b/health/health_config.c
index e1dd32ab1..f9decfad5 100644
--- a/health/health_config.c
+++ b/health/health_config.c
@@ -33,148 +33,6 @@
#define HEALTH_HOST_LABEL_KEY "host labels"
#define HEALTH_FOREACH_KEY "foreach"
-static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
- if(!rc->chart) {
- error("Health configuration for alarm '%s' does not have a chart", rc->name);
- return 0;
- }
-
- if(!rc->update_every) {
- error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name);
- return 0;
- }
-
- if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->calculation && !rc->warning && !rc->critical) {
- error("Health configuration for alarm '%s.%s' is useless (no db lookup, no calculation, no warning and no critical expressions)", rc->chart?rc->chart:"NOCHART", rc->name);
- return 0;
- }
-
- if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash))
- return 0;
-
- rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id);
-
- debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " NETDATA_DOUBLE_FORMAT_AUTO
- ", red " NETDATA_DOUBLE_FORMAT_AUTO
- ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
- rc->chart?rc->chart:"NOCHART",
- rc->name,
- rc->id,
- (rc->exec)?rc->exec:"DEFAULT",
- (rc->recipient)?rc->recipient:"DEFAULT",
- rc->green,
- rc->red,
- (int)rc->group,
- rc->after,
- rc->before,
- rc->options,
- (rc->dimensions)?rc->dimensions:"NONE",
- (rc->foreachdim)?rc->foreachdim:"NONE",
- rc->update_every,
- (rc->calculation)?rc->calculation->parsed_as:"NONE",
- (rc->warning)?rc->warning->parsed_as:"NONE",
- (rc->critical)?rc->critical->parsed_as:"NONE",
- rc->source,
- rc->delay_up_duration,
- rc->delay_down_duration,
- rc->delay_max_duration,
- rc->delay_multiplier,
- rc->warn_repeat_every,
- rc->crit_repeat_every
- );
-
- rrdcalc_add_to_host(host, rc);
-
- return 1;
-}
-
-static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) {
- if(unlikely(!rt->context)) {
- error("Health configuration for template '%s' does not have a context", rt->name);
- return 0;
- }
-
- if(unlikely(!rt->update_every)) {
- error("Health configuration for template '%s' has no frequency (parameter 'every'). Ignoring it.", rt->name);
- return 0;
- }
-
- if(unlikely(!RRDCALCTEMPLATE_HAS_DB_LOOKUP(rt) && !rt->calculation && !rt->warning && !rt->critical)) {
- error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name);
- return 0;
- }
-
- RRDCALCTEMPLATE *t, *last = NULL;
- if(!rt->foreachdim) {
- for (t = host->templates; t ; last = t, t = t->next) {
- if(unlikely(t->hash_name == rt->hash_name
- && !strcmp(t->name, rt->name)
- && !strcmp(t->family_match?t->family_match:"*", rt->family_match?rt->family_match:"*")
- )) {
- info("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
- return 0;
- }
- }
-
- if(likely(last)) {
- last->next = rt;
- }
- else {
- rt->next = host->templates;
- host->templates = rt;
- }
- } else {
- for (t = host->alarms_template_with_foreach; t ; last = t, t = t->next) {
- if(unlikely(t->hash_name == rt->hash_name
- && !strcmp(t->name, rt->name)
- && !strcmp(t->family_match?t->family_match:"*", rt->family_match?rt->family_match:"*")
- )) {
- info("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname);
- return 0;
- }
- }
-
- if(likely(last)) {
- last->next = rt;
- }
- else {
- rt->next = host->alarms_template_with_foreach;
- host->alarms_template_with_foreach = rt;
- }
- }
-
- debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " NETDATA_DOUBLE_FORMAT_AUTO
- ", red " NETDATA_DOUBLE_FORMAT_AUTO
- ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
- rt->name,
- (rt->context)?rt->context:"NONE",
- (rt->exec)?rt->exec:"DEFAULT",
- (rt->recipient)?rt->recipient:"DEFAULT",
- rt->green,
- rt->red,
- (int)rt->group,
- rt->after,
- rt->before,
- rt->options,
- (rt->dimensions)?rt->dimensions:"NONE",
- (rt->foreachdim)?rt->foreachdim:"NONE",
- rt->update_every,
- (rt->calculation)?rt->calculation->parsed_as:"NONE",
- (rt->warning)?rt->warning->parsed_as:"NONE",
- (rt->critical)?rt->critical->parsed_as:"NONE",
- rt->source,
- rt->delay_up_duration,
- rt->delay_down_duration,
- rt->delay_max_duration,
- rt->delay_multiplier,
- rt->warn_repeat_every,
- rt->crit_repeat_every
- );
-
-
- return 1;
-}
-
static inline int health_parse_delay(
size_t line, const char *filename, char *string,
int *delay_up_duration,
@@ -275,7 +133,7 @@ static inline uint32_t health_parse_options(const char *s) {
buf[count] = '\0';
if(!strcasecmp(buf, "no-clear-notification") || !strcasecmp(buf, "no-clear"))
- options |= RRDCALC_FLAG_NO_CLEAR_NOTIFICATION;
+ options |= RRDCALC_OPTION_NO_CLEAR_NOTIFICATION;
else
error("Ignoring unknown alarm option '%s'", buf);
}
@@ -334,13 +192,21 @@ static inline int health_parse_repeat(
*
* @param s the string that will be used to create the simple pattern.
*/
-SIMPLE_PATTERN *health_pattern_from_foreach(char *s) {
+
+static void dimension_remove_pipe_comma(char *str) {
+ while(*str) {
+ if(*str == '|' || *str == ',') *str = ' ';
+ str++;
+ }
+}
+
+static SIMPLE_PATTERN *health_pattern_from_foreach(const char *s) {
char *convert= strdupz(s);
SIMPLE_PATTERN *val = NULL;
+
if(convert) {
dimension_remove_pipe_comma(convert);
val = simple_pattern_create(convert, NULL, SIMPLE_PATTERN_EXACT);
-
freez(convert);
}
@@ -350,18 +216,18 @@ SIMPLE_PATTERN *health_pattern_from_foreach(char *s) {
static inline int health_parse_db_lookup(
size_t line, const char *filename, char *string,
RRDR_GROUPING *group_method, int *after, int *before, int *every,
- uint32_t *options, char **dimensions, char **foreachdim
+ RRDCALC_OPTIONS *options, STRING **dimensions, STRING **foreachdim
) {
debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s: %s", line, filename, string);
- if(*dimensions) freez(*dimensions);
- if(*foreachdim) freez(*foreachdim);
+ if(*dimensions) string_freez(*dimensions);
+ if(*foreachdim) string_freez(*foreachdim);
*dimensions = NULL;
*foreachdim = NULL;
*after = 0;
*before = 0;
*every = 0;
- *options = 0;
+ *options = (*options) & RRDCALC_ALL_OPTIONS_EXCLUDING_THE_RRDR_ONES; // preserve rrdcalc options
char *s = string, *key;
@@ -453,7 +319,7 @@ static inline int health_parse_db_lookup(
if(find) {
*find = '\0';
}
- *dimensions = strdupz(s);
+ *dimensions = string_strdupz(s);
}
if(!find) {
@@ -462,7 +328,7 @@ static inline int health_parse_db_lookup(
s = ++find;
}
else if(!strcasecmp(key, HEALTH_FOREACH_KEY )) {
- *foreachdim = strdupz(s);
+ *foreachdim = string_strdupz(s);
break;
}
else {
@@ -474,10 +340,10 @@ static inline int health_parse_db_lookup(
return 1;
}
-static inline char *health_source_file(size_t line, const char *file) {
+static inline STRING *health_source_file(size_t line, const char *file) {
char buffer[FILENAME_MAX + 1];
snprintfz(buffer, FILENAME_MAX, "%zu@%s", line, file);
- return strdupz(buffer);
+ return string_strdupz(buffer);
}
char *health_edit_command_from_source(const char *source)
@@ -496,7 +362,7 @@ char *health_edit_command_from_source(const char *source)
netdata_configured_user_config_dir,
file_no_path + 1,
temp,
- localhost->registry_hostname);
+ rrdhost_registry_hostname(localhost));
} else
buffer[0] = '\0';
@@ -513,35 +379,35 @@ static inline void strip_quotes(char *s) {
static inline void alert_config_free(struct alert_config *cfg)
{
- freez(cfg->alarm);
- freez(cfg->template_key);
- freez(cfg->os);
- freez(cfg->host);
- freez(cfg->on);
- freez(cfg->families);
- freez(cfg->plugin);
- freez(cfg->module);
- freez(cfg->charts);
- freez(cfg->lookup);
- freez(cfg->calc);
- freez(cfg->warn);
- freez(cfg->crit);
- freez(cfg->every);
- freez(cfg->green);
- freez(cfg->red);
- freez(cfg->exec);
- freez(cfg->to);
- freez(cfg->units);
- freez(cfg->info);
- freez(cfg->classification);
- freez(cfg->component);
- freez(cfg->type);
- freez(cfg->delay);
- freez(cfg->options);
- freez(cfg->repeat);
- freez(cfg->host_labels);
- freez(cfg->p_db_lookup_dimensions);
- freez(cfg->p_db_lookup_method);
+ string_freez(cfg->alarm);
+ string_freez(cfg->template_key);
+ string_freez(cfg->os);
+ string_freez(cfg->host);
+ string_freez(cfg->on);
+ string_freez(cfg->families);
+ string_freez(cfg->plugin);
+ string_freez(cfg->module);
+ string_freez(cfg->charts);
+ string_freez(cfg->lookup);
+ string_freez(cfg->calc);
+ string_freez(cfg->warn);
+ string_freez(cfg->crit);
+ string_freez(cfg->every);
+ string_freez(cfg->green);
+ string_freez(cfg->red);
+ string_freez(cfg->exec);
+ string_freez(cfg->to);
+ string_freez(cfg->units);
+ string_freez(cfg->info);
+ string_freez(cfg->classification);
+ string_freez(cfg->component);
+ string_freez(cfg->type);
+ string_freez(cfg->delay);
+ string_freez(cfg->options);
+ string_freez(cfg->repeat);
+ string_freez(cfg->host_labels);
+ string_freez(cfg->p_db_lookup_dimensions);
+ string_freez(cfg->p_db_lookup_method);
freez(cfg);
}
@@ -670,23 +536,35 @@ static int health_readfile(const char *filename, void *data) {
if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) {
if(rc) {
- if(!alert_hash_and_store_config(rc->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this || !rrdcalc_add_alarm_from_config(host, rc)) {
- rrdcalc_free(rc);
- }
+ if(!alert_hash_and_store_config(rc->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this)
+ rrdcalc_free_unused_rrdcalc_loaded_from_config(rc);
+ else
+ rrdcalc_add_from_config(host, rc);
+
// health_add_alarms_loop(host, rc, ignore_this) ;
}
if(rt) {
- if (!alert_hash_and_store_config(rt->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this || !rrdcalctemplate_add_template_from_config(host, rt)) {
- rrdcalctemplate_free(rt);
- }
+ if(!alert_hash_and_store_config(rt->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this)
+ rrdcalctemplate_free_unused_rrdcalctemplate_loaded_from_config(rt);
+ else
+ rrdcalctemplate_add_from_config(host, rt);
+
rt = NULL;
}
rc = callocz(1, sizeof(RRDCALC));
rc->next_event_id = 1;
- rc->name = strdupz(value);
- rc->hash = simple_hash(rc->name);
+
+ {
+ char *tmp = strdupz(value);
+ if(rrdvar_fix_name(tmp))
+ error("Health configuration renamed alarm '%s' to '%s'", value, tmp);
+
+ rc->name = string_strdupz(tmp);
+ freez(tmp);
+ }
+
rc->source = health_source_file(line, filename);
rc->green = NAN;
rc->red = NAN;
@@ -700,58 +578,62 @@ static int health_readfile(const char *filename, void *data) {
alert_config_free(alert_cfg);
alert_cfg = callocz(1, sizeof(struct alert_config));
- if(rrdvar_fix_name(rc->name))
- error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
-
- alert_cfg->alarm = strdupz(rc->name);
+ alert_cfg->alarm = string_dup(rc->name);
ignore_this = 0;
}
else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) {
if(rc) {
// health_add_alarms_loop(host, rc, ignore_this) ;
- if(!alert_hash_and_store_config(rc->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this || !rrdcalc_add_alarm_from_config(host, rc)) {
- rrdcalc_free(rc);
- }
+ if(!alert_hash_and_store_config(rc->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this)
+ rrdcalc_free_unused_rrdcalc_loaded_from_config(rc);
+ else
+ rrdcalc_add_from_config(host, rc);
rc = NULL;
}
if(rt) {
- if(!alert_hash_and_store_config(rt->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this || !rrdcalctemplate_add_template_from_config(host, rt)) {
- rrdcalctemplate_free(rt);
- }
+ if(!alert_hash_and_store_config(rt->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this)
+ rrdcalctemplate_free_unused_rrdcalctemplate_loaded_from_config(rt);
+ else
+ rrdcalctemplate_add_from_config(host, rt);
}
rt = callocz(1, sizeof(RRDCALCTEMPLATE));
- rt->name = strdupz(value);
- rt->hash_name = simple_hash(rt->name);
+
+ {
+ char *tmp = strdupz(value);
+ if(rrdvar_fix_name(tmp))
+ error("Health configuration renamed template '%s' to '%s'", value, tmp);
+
+ rt->name = string_strdupz(tmp);
+ freez(tmp);
+ }
+
rt->source = health_source_file(line, filename);
rt->green = NAN;
rt->red = NAN;
- rt->delay_multiplier = 1.0;
+ rt->delay_multiplier = (float)1.0;
rt->warn_repeat_every = host->health_default_warn_repeat_every;
rt->crit_repeat_every = host->health_default_crit_repeat_every;
if (alert_cfg)
alert_config_free(alert_cfg);
alert_cfg = callocz(1, sizeof(struct alert_config));
- if(rrdvar_fix_name(rt->name))
- error("Health configuration renamed template '%s' to '%s'", value, rt->name);
-
- alert_cfg->template_key = strdupz(rt->name);
+ alert_cfg->template_key = string_dup(rt->name);
ignore_this = 0;
}
else if(hash == hash_os && !strcasecmp(key, HEALTH_OS_KEY)) {
char *os_match = value;
- if (alert_cfg) alert_cfg->os = strdupz(value);
+ if (alert_cfg) alert_cfg->os = string_strdupz(value);
SIMPLE_PATTERN *os_pattern = simple_pattern_create(os_match, NULL, SIMPLE_PATTERN_EXACT);
- if(!simple_pattern_matches(os_pattern, host->os)) {
+ if(!simple_pattern_matches(os_pattern, rrdhost_os(host))) {
if(rc)
- debug(D_HEALTH, "HEALTH on '%s' ignoring alarm '%s' defined at %zu@%s: host O/S does not match '%s'", host->hostname, rc->name, line, filename, os_match);
+ debug(D_HEALTH, "HEALTH on '%s' ignoring alarm '%s' defined at %zu@%s: host O/S does not match '%s'", rrdhost_hostname(host), rrdcalc_name(rc), line, filename, os_match);
if(rt)
- debug(D_HEALTH, "HEALTH on '%s' ignoring template '%s' defined at %zu@%s: host O/S does not match '%s'", host->hostname, rt->name, line, filename, os_match);
+ debug(D_HEALTH, "HEALTH on '%s' ignoring template '%s' defined at %zu@%s: host O/S does not match '%s'", rrdhost_hostname(host), rrdcalctemplate_name(rt), line, filename, os_match);
ignore_this = 1;
}
@@ -760,15 +642,15 @@ static int health_readfile(const char *filename, void *data) {
}
else if(hash == hash_host && !strcasecmp(key, HEALTH_HOST_KEY)) {
char *host_match = value;
- if (alert_cfg) alert_cfg->host = strdupz(value);
+ if (alert_cfg) alert_cfg->host = string_strdupz(value);
SIMPLE_PATTERN *host_pattern = simple_pattern_create(host_match, NULL, SIMPLE_PATTERN_EXACT);
- if(!simple_pattern_matches(host_pattern, host->hostname)) {
+ if(!simple_pattern_matches(host_pattern, rrdhost_hostname(host))) {
if(rc)
- debug(D_HEALTH, "HEALTH on '%s' ignoring alarm '%s' defined at %zu@%s: hostname does not match '%s'", host->hostname, rc->name, line, filename, host_match);
+ debug(D_HEALTH, "HEALTH on '%s' ignoring alarm '%s' defined at %zu@%s: hostname does not match '%s'", rrdhost_hostname(host), rrdcalc_name(rc), line, filename, host_match);
if(rt)
- debug(D_HEALTH, "HEALTH on '%s' ignoring template '%s' defined at %zu@%s: hostname does not match '%s'", host->hostname, rt->name, line, filename, host_match);
+ debug(D_HEALTH, "HEALTH on '%s' ignoring template '%s' defined at %zu@%s: hostname does not match '%s'", rrdhost_hostname(host), rrdcalctemplate_name(rt), line, filename, host_match);
ignore_this = 1;
}
@@ -777,65 +659,68 @@ static int health_readfile(const char *filename, void *data) {
}
else if(rc) {
if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
- alert_cfg->on = strdupz(value);
+ alert_cfg->on = string_strdupz(value);
if(rc->chart) {
- if(strcmp(rc->chart, value) != 0)
+ if(strcmp(rrdcalc_chart_name(rc), value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rc->name, key, rc->chart, value, value);
+ line, filename, rrdcalc_name(rc), key, rrdcalc_chart_name(rc), value, value);
- freez(rc->chart);
+ string_freez(rc->chart);
}
- rc->chart = strdupz(value);
- rc->hash_chart = simple_hash(rc->chart);
+ rc->chart = string_strdupz(value);
}
else if(hash == hash_class && !strcasecmp(key, HEALTH_CLASS_KEY)) {
- alert_cfg->classification = strdupz(value);
+ strip_quotes(value);
+
+ alert_cfg->classification = string_strdupz(value);
if(rc->classification) {
- if(strcmp(rc->classification, value) != 0)
+ if(strcmp(rrdcalc_classification(rc), value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rc->name, key, rc->classification, value, value);
+ line, filename, rrdcalc_name(rc), key, rrdcalc_classification(rc), value, value);
- freez(rc->classification);
+ string_freez(rc->classification);
}
- rc->classification = strdupz(value);
- strip_quotes(rc->classification);
+ rc->classification = string_strdupz(value);
}
else if(hash == hash_component && !strcasecmp(key, HEALTH_COMPONENT_KEY)) {
- alert_cfg->component = strdupz(value);
+ strip_quotes(value);
+
+ alert_cfg->component = string_strdupz(value);
if(rc->component) {
- if(strcmp(rc->component, value) != 0)
+ if(strcmp(rrdcalc_component(rc), value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rc->name, key, rc->component, value, value);
+ line, filename, rrdcalc_name(rc), key, rrdcalc_component(rc), value, value);
- freez(rc->component);
+ string_freez(rc->component);
}
- rc->component = strdupz(value);
- strip_quotes(rc->component);
+ rc->component = string_strdupz(value);
}
else if(hash == hash_type && !strcasecmp(key, HEALTH_TYPE_KEY)) {
- alert_cfg->type = strdupz(value);
+ strip_quotes(value);
+
+ alert_cfg->type = string_strdupz(value);
if(rc->type) {
- if(strcmp(rc->type, value) != 0)
+ if(strcmp(rrdcalc_type(rc), value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rc->name, key, rc->type, value, value);
+ line, filename, rrdcalc_name(rc), key, rrdcalc_type(rc), value, value);
- freez(rc->type);
+ string_freez(rc->type);
}
- rc->type = strdupz(value);
- strip_quotes(rc->type);
+ rc->type = string_strdupz(value);
}
else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
- alert_cfg->lookup = strdupz(value);
+ alert_cfg->lookup = string_strdupz(value);
health_parse_db_lookup(line, filename, value, &rc->group, &rc->after, &rc->before,
- &rc->update_every, &rc->options, &rc->dimensions, &rc->foreachdim);
- if(rc->foreachdim) {
- rc->spdim = health_pattern_from_foreach(rc->foreachdim);
- }
+ &rc->update_every, &rc->options, &rc->dimensions, &rc->foreach_dimension);
+
+ if(rc->foreach_dimension)
+ rc->foreach_dimension_pattern = health_pattern_from_foreach(rrdcalc_foreachdim(rc));
+
if (rc->after) {
if (rc->dimensions)
- alert_cfg->p_db_lookup_dimensions = strdupz(rc->dimensions);
+ alert_cfg->p_db_lookup_dimensions = string_dup(rc->dimensions);
if (rc->group)
- alert_cfg->p_db_lookup_method = strdupz(group_method2string(rc->group));
+ alert_cfg->p_db_lookup_method = string_strdupz(group_method2string(rc->group));
alert_cfg->p_db_lookup_options = rc->options;
alert_cfg->p_db_lookup_after = rc->after;
alert_cfg->p_db_lookup_before = rc->before;
@@ -843,248 +728,261 @@ static int health_readfile(const char *filename, void *data) {
}
}
else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
- alert_cfg->every = strdupz(value);
+ alert_cfg->every = string_strdupz(value);
if(!config_parse_duration(value, &rc->update_every))
error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
- line, filename, rc->name, key, value);
+ line, filename, rrdcalc_name(rc), key, value);
alert_cfg->p_update_every = rc->update_every;
}
else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
- alert_cfg->green = strdupz(value);
+ alert_cfg->green = string_strdupz(value);
char *e;
rc->green = str2ndd(value, &e);
if(e && *e) {
error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
- line, filename, rc->name, key, e);
+ line, filename, rrdcalc_name(rc), key, e);
}
}
else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
- alert_cfg->red = strdupz(value);
+ alert_cfg->red = string_strdupz(value);
char *e;
rc->red = str2ndd(value, &e);
if(e && *e) {
error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
- line, filename, rc->name, key, e);
+ line, filename, rrdcalc_name(rc), key, e);
}
}
else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
- alert_cfg->calc = strdupz(value);
+ alert_cfg->calc = string_strdupz(value);
const char *failed_at = NULL;
int error = 0;
rc->calculation = expression_parse(value, &failed_at, &error);
if(!rc->calculation) {
error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
- line, filename, rc->name, key, value, expression_strerror(error), failed_at);
+ line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at);
}
}
else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
- alert_cfg->warn = strdupz(value);
+ alert_cfg->warn = string_strdupz(value);
const char *failed_at = NULL;
int error = 0;
rc->warning = expression_parse(value, &failed_at, &error);
if(!rc->warning) {
error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
- line, filename, rc->name, key, value, expression_strerror(error), failed_at);
+ line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at);
}
}
else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
- alert_cfg->crit = strdupz(value);
+ alert_cfg->crit = string_strdupz(value);
const char *failed_at = NULL;
int error = 0;
rc->critical = expression_parse(value, &failed_at, &error);
if(!rc->critical) {
error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
- line, filename, rc->name, key, value, expression_strerror(error), failed_at);
+ line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at);
}
}
else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
- alert_cfg->exec = strdupz(value);
+ alert_cfg->exec = string_strdupz(value);
if(rc->exec) {
- if(strcmp(rc->exec, value) != 0)
+ if(strcmp(rrdcalc_exec(rc), value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rc->name, key, rc->exec, value, value);
+ line, filename, rrdcalc_name(rc), key, rrdcalc_exec(rc), value, value);
- freez(rc->exec);
+ string_freez(rc->exec);
}
- rc->exec = strdupz(value);
+ rc->exec = string_strdupz(value);
}
else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
- alert_cfg->to = strdupz(value);
+ alert_cfg->to = string_strdupz(value);
if(rc->recipient) {
- if(strcmp(rc->recipient, value) != 0)
+ if(strcmp(rrdcalc_recipient(rc), value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rc->name, key, rc->recipient, value, value);
+ line, filename, rrdcalc_name(rc), key, rrdcalc_recipient(rc), value, value);
- freez(rc->recipient);
+ string_freez(rc->recipient);
}
- rc->recipient = strdupz(value);
+ rc->recipient = string_strdupz(value);
}
else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
- alert_cfg->units = strdupz(value);
+ strip_quotes(value);
+
+ alert_cfg->units = string_strdupz(value);
if(rc->units) {
- if(strcmp(rc->units, value) != 0)
+ if(strcmp(rrdcalc_units(rc), value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rc->name, key, rc->units, value, value);
+ line, filename, rrdcalc_name(rc), key, rrdcalc_units(rc), value, value);
- freez(rc->units);
+ string_freez(rc->units);
}
- rc->units = strdupz(value);
- strip_quotes(rc->units);
+ rc->units = string_strdupz(value);
}
else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
- alert_cfg->info = strdupz(value);
+ strip_quotes(value);
+
+ alert_cfg->info = string_strdupz(value);
if(rc->info) {
- if(strcmp(rc->info, value) != 0)
+ if(strcmp(rrdcalc_info(rc), value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rc->name, key, rc->info, value, value);
+ line, filename, rrdcalc_name(rc), key, rrdcalc_info(rc), value, value);
- freez(rc->info);
+ string_freez(rc->info);
+ string_freez(rc->original_info);
}
- rc->info = strdupz(value);
- strip_quotes(rc->info);
+ rc->info = string_strdupz(value);
+ rc->original_info = string_dup(rc->info);
}
else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
- alert_cfg->delay = strdupz(value);
+ alert_cfg->delay = string_strdupz(value);
health_parse_delay(line, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier);
}
else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
- alert_cfg->options = strdupz(value);
+ alert_cfg->options = string_strdupz(value);
rc->options |= health_parse_options(value);
}
else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){
- alert_cfg->repeat = strdupz(value);
+ alert_cfg->repeat = string_strdupz(value);
health_parse_repeat(line, filename, value,
&rc->warn_repeat_every,
&rc->crit_repeat_every);
}
else if(hash == hash_host_label && !strcasecmp(key, HEALTH_HOST_LABEL_KEY)) {
- alert_cfg->host_labels = strdupz(value);
+ alert_cfg->host_labels = string_strdupz(value);
if(rc->host_labels) {
- if(strcmp(rc->host_labels, value) != 0)
+ if(strcmp(rrdcalc_host_labels(rc), value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'.",
- line, filename, rc->name, key, value, value);
+ line, filename, rrdcalc_name(rc), key, value, value);
- freez(rc->host_labels);
+ string_freez(rc->host_labels);
simple_pattern_free(rc->host_labels_pattern);
}
- rc->host_labels = simple_pattern_trim_around_equal(value);
- rc->host_labels_pattern = simple_pattern_create(rc->host_labels, NULL, SIMPLE_PATTERN_EXACT);
+ {
+ char *tmp = simple_pattern_trim_around_equal(value);
+ rc->host_labels = string_strdupz(tmp);
+ freez(tmp);
+ }
+ rc->host_labels_pattern = simple_pattern_create(rrdcalc_host_labels(rc), NULL, SIMPLE_PATTERN_EXACT);
}
else if(hash == hash_plugin && !strcasecmp(key, HEALTH_PLUGIN_KEY)) {
- alert_cfg->plugin = strdupz(value);
- freez(rc->plugin_match);
+ alert_cfg->plugin = string_strdupz(value);
+ string_freez(rc->plugin_match);
simple_pattern_free(rc->plugin_pattern);
- rc->plugin_match = strdupz(value);
- rc->plugin_pattern = simple_pattern_create(rc->plugin_match, NULL, SIMPLE_PATTERN_EXACT);
+ rc->plugin_match = string_strdupz(value);
+ rc->plugin_pattern = simple_pattern_create(rrdcalc_plugin_match(rc), NULL, SIMPLE_PATTERN_EXACT);
}
else if(hash == hash_module && !strcasecmp(key, HEALTH_MODULE_KEY)) {
- alert_cfg->module = strdupz(value);
- freez(rc->module_match);
+ alert_cfg->module = string_strdupz(value);
+ string_freez(rc->module_match);
simple_pattern_free(rc->module_pattern);
- rc->module_match = strdupz(value);
- rc->module_pattern = simple_pattern_create(rc->module_match, NULL, SIMPLE_PATTERN_EXACT);
+ rc->module_match = string_strdupz(value);
+ rc->module_pattern = simple_pattern_create(rrdcalc_module_match(rc), NULL, SIMPLE_PATTERN_EXACT);
}
else {
error("Health configuration at line %zu of file '%s' for alarm '%s' has unknown key '%s'.",
- line, filename, rc->name, key);
+ line, filename, rrdcalc_name(rc), key);
}
}
else if(rt) {
if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) {
- alert_cfg->on = strdupz(value);
+ alert_cfg->on = string_strdupz(value);
if(rt->context) {
- if(strcmp(rt->context, value) != 0)
+ if(strcmp(string2str(rt->context), value) != 0)
error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rt->name, key, rt->context, value, value);
+ line, filename, rrdcalctemplate_name(rt), key, string2str(rt->context), value, value);
- freez(rt->context);
+ string_freez(rt->context);
}
- rt->context = strdupz(value);
- rt->hash_context = simple_hash(rt->context);
+ rt->context = string_strdupz(value);
}
else if(hash == hash_class && !strcasecmp(key, HEALTH_CLASS_KEY)) {
- alert_cfg->classification = strdupz(value);
+ strip_quotes(value);
+
+ alert_cfg->classification = string_strdupz(value);
if(rt->classification) {
- if(strcmp(rt->classification, value) != 0)
+ if(strcmp(rrdcalctemplate_classification(rt), value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rt->name, key, rt->classification, value, value);
+ line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_classification(rt), value, value);
- freez(rt->classification);
+ string_freez(rt->classification);
}
- rt->classification = strdupz(value);
- strip_quotes(rt->classification);
+ rt->classification = string_strdupz(value);
}
else if(hash == hash_component && !strcasecmp(key, HEALTH_COMPONENT_KEY)) {
- alert_cfg->component = strdupz(value);
+ strip_quotes(value);
+
+ alert_cfg->component = string_strdupz(value);
if(rt->component) {
- if(strcmp(rt->component, value) != 0)
+ if(strcmp(rrdcalctemplate_component(rt), value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rt->name, key, rt->component, value, value);
+ line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_component(rt), value, value);
- freez(rt->component);
+ string_freez(rt->component);
}
- rt->component = strdupz(value);
- strip_quotes(rt->component);
+ rt->component = string_strdupz(value);
}
else if(hash == hash_type && !strcasecmp(key, HEALTH_TYPE_KEY)) {
- alert_cfg->type = strdupz(value);
+ strip_quotes(value);
+
+ alert_cfg->type = string_strdupz(value);
if(rt->type) {
- if(strcmp(rt->type, value) != 0)
+ if(strcmp(rrdcalctemplate_type(rt), value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rt->name, key, rt->type, value, value);
+ line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_type(rt), value, value);
- freez(rt->type);
+ string_freez(rt->type);
}
- rt->type = strdupz(value);
- strip_quotes(rt->type);
+ rt->type = string_strdupz(value);
}
else if(hash == hash_families && !strcasecmp(key, HEALTH_FAMILIES_KEY)) {
- alert_cfg->families = strdupz(value);
- freez(rt->family_match);
+ alert_cfg->families = string_strdupz(value);
+ string_freez(rt->family_match);
simple_pattern_free(rt->family_pattern);
- rt->family_match = strdupz(value);
- rt->family_pattern = simple_pattern_create(rt->family_match, NULL, SIMPLE_PATTERN_EXACT);
+ rt->family_match = string_strdupz(value);
+ rt->family_pattern = simple_pattern_create(rrdcalctemplate_family_match(rt), NULL, SIMPLE_PATTERN_EXACT);
}
else if(hash == hash_plugin && !strcasecmp(key, HEALTH_PLUGIN_KEY)) {
- alert_cfg->plugin = strdupz(value);
- freez(rt->plugin_match);
+ alert_cfg->plugin = string_strdupz(value);
+ string_freez(rt->plugin_match);
simple_pattern_free(rt->plugin_pattern);
- rt->plugin_match = strdupz(value);
- rt->plugin_pattern = simple_pattern_create(rt->plugin_match, NULL, SIMPLE_PATTERN_EXACT);
+ rt->plugin_match = string_strdupz(value);
+ rt->plugin_pattern = simple_pattern_create(rrdcalctemplate_plugin_match(rt), NULL, SIMPLE_PATTERN_EXACT);
}
else if(hash == hash_module && !strcasecmp(key, HEALTH_MODULE_KEY)) {
- alert_cfg->module = strdupz(value);
- freez(rt->module_match);
+ alert_cfg->module = string_strdupz(value);
+ string_freez(rt->module_match);
simple_pattern_free(rt->module_pattern);
- rt->module_match = strdupz(value);
- rt->module_pattern = simple_pattern_create(rt->module_match, NULL, SIMPLE_PATTERN_EXACT);
+ rt->module_match = string_strdupz(value);
+ rt->module_pattern = simple_pattern_create(rrdcalctemplate_module_match(rt), NULL, SIMPLE_PATTERN_EXACT);
}
else if(hash == hash_charts && !strcasecmp(key, HEALTH_CHARTS_KEY)) {
- alert_cfg->charts = strdupz(value);
- freez(rt->charts_match);
+ alert_cfg->charts = string_strdupz(value);
+ string_freez(rt->charts_match);
simple_pattern_free(rt->charts_pattern);
- rt->charts_match = strdupz(value);
- rt->charts_pattern = simple_pattern_create(rt->charts_match, NULL, SIMPLE_PATTERN_EXACT);
+ rt->charts_match = string_strdupz(value);
+ rt->charts_pattern = simple_pattern_create(rrdcalctemplate_charts_match(rt), NULL, SIMPLE_PATTERN_EXACT);
}
else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
- alert_cfg->lookup = strdupz(value);
+ alert_cfg->lookup = string_strdupz(value);
health_parse_db_lookup(line, filename, value, &rt->group, &rt->after, &rt->before,
- &rt->update_every, &rt->options, &rt->dimensions, &rt->foreachdim);
- if(rt->foreachdim) {
- rt->spdim = health_pattern_from_foreach(rt->foreachdim);
- }
+ &rt->update_every, &rt->options, &rt->dimensions, &rt->foreach_dimension);
+
+ if(rt->foreach_dimension)
+ rt->foreach_dimension_pattern = health_pattern_from_foreach(rrdcalctemplate_foreachdim(rt));
+
if (rt->after) {
if (rt->dimensions)
- alert_cfg->p_db_lookup_dimensions = strdupz(rt->dimensions);
+ alert_cfg->p_db_lookup_dimensions = string_dup(rt->dimensions);
+
if (rt->group)
- alert_cfg->p_db_lookup_method = strdupz(group_method2string(rt->group));
+ alert_cfg->p_db_lookup_method = string_strdupz(group_method2string(rt->group));
+
alert_cfg->p_db_lookup_options = rt->options;
alert_cfg->p_db_lookup_after = rt->after;
alert_cfg->p_db_lookup_before = rt->before;
@@ -1092,137 +990,143 @@ static int health_readfile(const char *filename, void *data) {
}
}
else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
- alert_cfg->every = strdupz(value);
+ alert_cfg->every = string_strdupz(value);
if(!config_parse_duration(value, &rt->update_every))
error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
- line, filename, rt->name, key, value);
+ line, filename, rrdcalctemplate_name(rt), key, value);
alert_cfg->p_update_every = rt->update_every;
}
else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
- alert_cfg->green = strdupz(value);
+ alert_cfg->green = string_strdupz(value);
char *e;
rt->green = str2ndd(value, &e);
if(e && *e) {
error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
- line, filename, rt->name, key, e);
+ line, filename, rrdcalctemplate_name(rt), key, e);
}
}
else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
- alert_cfg->red = strdupz(value);
+ alert_cfg->red = string_strdupz(value);
char *e;
rt->red = str2ndd(value, &e);
if(e && *e) {
error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
- line, filename, rt->name, key, e);
+ line, filename, rrdcalctemplate_name(rt), key, e);
}
}
else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) {
- alert_cfg->calc = strdupz(value);
+ alert_cfg->calc = string_strdupz(value);
const char *failed_at = NULL;
int error = 0;
rt->calculation = expression_parse(value, &failed_at, &error);
if(!rt->calculation) {
error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
- line, filename, rt->name, key, value, expression_strerror(error), failed_at);
+ line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at);
}
}
else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) {
- alert_cfg->warn = strdupz(value);
+ alert_cfg->warn = string_strdupz(value);
const char *failed_at = NULL;
int error = 0;
rt->warning = expression_parse(value, &failed_at, &error);
if(!rt->warning) {
error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
- line, filename, rt->name, key, value, expression_strerror(error), failed_at);
+ line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at);
}
}
else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) {
- alert_cfg->crit = strdupz(value);
+ alert_cfg->crit = string_strdupz(value);
const char *failed_at = NULL;
int error = 0;
rt->critical = expression_parse(value, &failed_at, &error);
if(!rt->critical) {
error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'",
- line, filename, rt->name, key, value, expression_strerror(error), failed_at);
+ line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at);
}
}
else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) {
- alert_cfg->exec = strdupz(value);
+ alert_cfg->exec = string_strdupz(value);
if(rt->exec) {
- if(strcmp(rt->exec, value) != 0)
+ if(strcmp(rrdcalctemplate_exec(rt), value) != 0)
error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rt->name, key, rt->exec, value, value);
+ line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_exec(rt), value, value);
- freez(rt->exec);
+ string_freez(rt->exec);
}
- rt->exec = strdupz(value);
+ rt->exec = string_strdupz(value);
}
else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) {
- alert_cfg->to = strdupz(value);
+ alert_cfg->to = string_strdupz(value);
if(rt->recipient) {
- if(strcmp(rt->recipient, value) != 0)
+ if(strcmp(rrdcalctemplate_recipient(rt), value) != 0)
error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rt->name, key, rt->recipient, value, value);
+ line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_recipient(rt), value, value);
- freez(rt->recipient);
+ string_freez(rt->recipient);
}
- rt->recipient = strdupz(value);
+ rt->recipient = string_strdupz(value);
}
else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) {
- alert_cfg->units = strdupz(value);
+ strip_quotes(value);
+
+ alert_cfg->units = string_strdupz(value);
if(rt->units) {
- if(strcmp(rt->units, value) != 0)
+ if(strcmp(rrdcalctemplate_units(rt), value) != 0)
error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rt->name, key, rt->units, value, value);
+ line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_units(rt), value, value);
- freez(rt->units);
+ string_freez(rt->units);
}
- rt->units = strdupz(value);
- strip_quotes(rt->units);
+ rt->units = string_strdupz(value);
}
else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) {
- alert_cfg->info = strdupz(value);
+ strip_quotes(value);
+
+ alert_cfg->info = string_strdupz(value);
if(rt->info) {
- if(strcmp(rt->info, value) != 0)
+ if(strcmp(rrdcalctemplate_info(rt), value) != 0)
error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rt->name, key, rt->info, value, value);
+ line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_info(rt), value, value);
- freez(rt->info);
+ string_freez(rt->info);
}
- rt->info = strdupz(value);
- strip_quotes(rt->info);
+ rt->info = string_strdupz(value);
}
else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) {
- alert_cfg->delay = strdupz(value);
+ alert_cfg->delay = string_strdupz(value);
health_parse_delay(line, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier);
}
else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
- alert_cfg->options = strdupz(value);
+ alert_cfg->options = string_strdupz(value);
rt->options |= health_parse_options(value);
}
else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){
- alert_cfg->repeat = strdupz(value);
+ alert_cfg->repeat = string_strdupz(value);
health_parse_repeat(line, filename, value,
&rt->warn_repeat_every,
&rt->crit_repeat_every);
}
else if(hash == hash_host_label && !strcasecmp(key, HEALTH_HOST_LABEL_KEY)) {
- alert_cfg->host_labels = strdupz(value);
+ alert_cfg->host_labels = string_strdupz(value);
if(rt->host_labels) {
- if(strcmp(rt->host_labels, value) != 0)
+ if(strcmp(rrdcalctemplate_host_labels(rt), value) != 0)
error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rt->name, key, rt->host_labels, value, value);
+ line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_host_labels(rt), value, value);
- freez(rt->host_labels);
+ string_freez(rt->host_labels);
simple_pattern_free(rt->host_labels_pattern);
}
- rt->host_labels = simple_pattern_trim_around_equal(value);
- rt->host_labels_pattern = simple_pattern_create(rt->host_labels, NULL, SIMPLE_PATTERN_EXACT);
+ {
+ char *tmp = simple_pattern_trim_around_equal(value);
+ rt->host_labels = string_strdupz(tmp);
+ freez(tmp);
+ }
+ rt->host_labels_pattern = simple_pattern_create(rrdcalctemplate_host_labels(rt), NULL, SIMPLE_PATTERN_EXACT);
}
else {
error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.",
- line, filename, rt->name, key);
+ line, filename, rrdcalctemplate_name(rt), key);
}
}
else {
@@ -1233,15 +1137,17 @@ static int health_readfile(const char *filename, void *data) {
if(rc) {
//health_add_alarms_loop(host, rc, ignore_this) ;
- if(!alert_hash_and_store_config(rc->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this || !rrdcalc_add_alarm_from_config(host, rc)) {
- rrdcalc_free(rc);
- }
+ if(!alert_hash_and_store_config(rc->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this)
+ rrdcalc_free_unused_rrdcalc_loaded_from_config(rc);
+ else
+ rrdcalc_add_from_config(host, rc);
}
if(rt) {
- if(!alert_hash_and_store_config(rt->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this || !rrdcalctemplate_add_template_from_config(host, rt)) {
- rrdcalctemplate_free(rt);
- }
+ if(!alert_hash_and_store_config(rt->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this)
+ rrdcalctemplate_free_unused_rrdcalctemplate_loaded_from_config(rt);
+ else
+ rrdcalctemplate_add_from_config(host, rt);
}
if (alert_cfg)
@@ -1257,8 +1163,8 @@ void sql_refresh_hashes(void)
}
void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath) {
- if(unlikely(!host->health_enabled)) {
- debug(D_HEALTH, "CONFIG health is not enabled for host '%s'", host->hostname);
+ if(unlikely(!host->health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) {
+ debug(D_HEALTH, "CONFIG health is not enabled for host '%s'", rrdhost_hostname(host));
return;
}
@@ -1266,10 +1172,11 @@ void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path
CONFIG_BOOLEAN_YES);
if (!stock_enabled) {
- info("Netdata will not load stock alarms.");
+ log_health("[%s]: Netdata will not load stock alarms.", rrdhost_hostname(host));
stock_path = user_path;
}
recursive_config_double_dir_load(user_path, stock_path, subpath, health_readfile, (void *) host, 0);
+ log_health("[%s]: Read health configuration.", rrdhost_hostname(host));
sql_store_hashes = 0;
}
diff --git a/health/health_json.c b/health/health_json.c
index 4e8f43761..2dd59fd46 100644
--- a/health/health_json.c
+++ b/health/health_json.c
@@ -14,7 +14,7 @@ void health_string2json(BUFFER *wb, const char *prefix, const char *label, const
}
void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
- char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN");
+ char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN");
char config_hash_id[GUID_LEN + 1];
uuid_unparse_lower(ae->config_hash_id, config_hash_id);
@@ -57,30 +57,30 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
"\t\t\"old_value_string\": \"%s\",\n"
"\t\t\"last_repeat\": \"%lu\",\n"
"\t\t\"silenced\": \"%s\",\n"
- , host->hostname
+ , rrdhost_hostname(host)
, host->utc_offset
- , host->abbrev_timezone
+ , rrdhost_abbrev_timezone(host)
, ae->unique_id
, ae->alarm_id
, ae->alarm_event_id
, config_hash_id
- , ae->name
- , ae->chart
- , ae->chart_context
- , ae->family
- , ae->classification?ae->classification:"Unknown"
- , ae->component?ae->component:"Unknown"
- , ae->type?ae->type:"Unknown"
+ , ae_name(ae)
+ , ae_chart_name(ae)
+ , ae_chart_context(ae)
+ , ae_family(ae)
+ , ae->classification?ae_classification(ae):"Unknown"
+ , ae->component?ae_component(ae):"Unknown"
+ , ae->type?ae_type(ae):"Unknown"
, (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false"
, (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false"
, (unsigned long)ae->exec_run_timestamp
, (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false"
- , ae->exec?ae->exec:host->health_default_exec
- , ae->recipient?ae->recipient:host->health_default_recipient
+ , ae->exec?ae_exec(ae):string2str(host->health_default_exec)
+ , ae->recipient?ae_recipient(ae):string2str(host->health_default_recipient)
, ae->exec_code
- , ae->source
+ , ae_source(ae)
, edit_command
- , ae->units?ae->units:""
+ , ae_units(ae)
, (unsigned long)ae->when
, (unsigned long)ae->duration
, (unsigned long)ae->non_clear_duration
@@ -90,28 +90,13 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
, (unsigned long)ae->delay_up_to_timestamp
, ae->updated_by_id
, ae->updates_id
- , ae->new_value_string
- , ae->old_value_string
+ , ae_new_value_string(ae)
+ , ae_old_value_string(ae)
, (unsigned long)ae->last_repeat
, (ae->flags & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false"
);
- char *replaced_info = NULL;
- if (likely(ae->info)) {
- char *m = NULL;
- replaced_info = strdupz(ae->info);
- size_t pos = 0;
- while ((m = strstr(replaced_info + pos, "$family"))) {
- char *buf = NULL;
- pos = m - replaced_info;
- buf = find_and_replace(replaced_info, "$family", ae->family ? ae->family : "", m);
- freez(replaced_info);
- replaced_info = strdupz(buf);
- freez(buf);
- }
- }
-
- health_string2json(wb, "\t\t", "info", replaced_info?replaced_info:"", ",\n");
+ health_string2json(wb, "\t\t", "info", ae->info ? ae_info(ae) : "", ",\n");
if(unlikely(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) {
buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n");
@@ -127,22 +112,23 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
buffer_strcat(wb, "\t}");
- freez(replaced_info);
freez(edit_command);
}
void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) {
- netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
buffer_strcat(wb, "[");
unsigned int max = host->health_log.max;
unsigned int count = 0;
- uint32_t hash_chart = 0;
- if (chart) hash_chart = simple_hash(chart);
+
+ STRING *chart_string = string_strdupz(chart);
+
+ netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
+
ALARM_ENTRY *ae;
for (ae = host->health_log.alarms; ae && count < max; ae = ae->next) {
- if ((ae->unique_id > after) && (!chart || (ae->hash_chart == hash_chart && !strcmp(ae->chart, chart)))) {
+ if ((ae->unique_id > after) && (!chart || chart_string == ae->chart)) {
if (likely(count))
buffer_strcat(wb, ",");
health_alarm_entry2json_nolock(wb, ae, host);
@@ -150,9 +136,11 @@ void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *char
}
}
- buffer_strcat(wb, "\n]\n");
-
netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+
+ string_freez(chart_string);
+
+ buffer_strcat(wb, "\n]\n");
}
static inline void health_rrdcalc_values2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC *rc) {
@@ -160,7 +148,7 @@ static inline void health_rrdcalc_values2json_nolock(RRDHOST *host, BUFFER *wb,
buffer_sprintf(wb,
"\t\t\"%s.%s\": {\n"
"\t\t\t\"id\": %lu,\n"
- , rc->chart, rc->name
+ , rrdcalc_chart_name(rc), rrdcalc_name(rc)
, (unsigned long)rc->id);
buffer_strcat(wb, "\t\t\t\"value\":");
@@ -180,22 +168,7 @@ static inline void health_rrdcalc_values2json_nolock(RRDHOST *host, BUFFER *wb,
static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC *rc) {
char value_string[100 + 1];
- format_value_and_unit(value_string, 100, rc->value, rc->units, -1);
-
- char *replaced_info = NULL;
- if (likely(rc->info)) {
- char *m;
- replaced_info = strdupz(rc->info);
- size_t pos = 0;
- while ((m = strstr(replaced_info + pos, "$family"))) {
- char *buf = NULL;
- pos = m - replaced_info;
- buf = find_and_replace(replaced_info, "$family", (rc->rrdset && rc->rrdset->family) ? rc->rrdset->family : "", m);
- freez(replaced_info);
- replaced_info = strdupz(buf);
- freez(buf);
- }
- }
+ format_value_and_unit(value_string, 100, rc->value, rrdcalc_units(rc), -1);
char hash_id[GUID_LEN + 1];
uuid_unparse_lower(rc->config_hash_id, hash_id);
@@ -234,23 +207,23 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
"\t\t\t\"value_string\": \"%s\",\n"
"\t\t\t\"last_repeat\": \"%lu\",\n"
"\t\t\t\"times_repeat\": %lu,\n"
- , rc->chart, rc->name
+ , rrdcalc_chart_name(rc), rrdcalc_name(rc)
, (unsigned long)rc->id
, hash_id
- , rc->name
- , rc->chart
- , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
- , rc->classification?rc->classification:"Unknown"
- , rc->component?rc->component:"Unknown"
- , rc->type?rc->type:"Unknown"
+ , rrdcalc_name(rc)
+ , rrdcalc_chart_name(rc)
+ , (rc->rrdset)?rrdset_family(rc->rrdset):""
+ , rc->classification?rrdcalc_classification(rc):"Unknown"
+ , rc->component?rrdcalc_component(rc):"Unknown"
+ , rc->type?rrdcalc_type(rc):"Unknown"
, (rc->rrdset)?"true":"false"
- , (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false"
- , (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
- , rc->exec?rc->exec:host->health_default_exec
- , rc->recipient?rc->recipient:host->health_default_recipient
- , rc->source
- , rc->units?rc->units:""
- , replaced_info?replaced_info:""
+ , (rc->run_flags & RRDCALC_FLAG_DISABLED)?"true":"false"
+ , (rc->run_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
+ , rc->exec?rrdcalc_exec(rc):string2str(host->health_default_exec)
+ , rc->recipient?rrdcalc_recipient(rc):string2str(host->health_default_recipient)
+ , rrdcalc_source(rc)
+ , rrdcalc_units(rc)
+ , rrdcalc_info(rc)
, rrdcalc_status2string(rc->status)
, (unsigned long)rc->last_status_change
, (unsigned long)rc->last_updated
@@ -269,13 +242,13 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
, (unsigned long)rc->times_repeat
);
- if(unlikely(rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)) {
+ if(unlikely(rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)) {
buffer_strcat(wb, "\t\t\t\"no_clear_notification\": true,\n");
}
if(RRDCALC_HAS_DB_LOOKUP(rc)) {
- if(rc->dimensions && *rc->dimensions)
- health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n");
+ if(rc->dimensions)
+ health_string2json(wb, "\t\t\t", "lookup_dimensions", rrdcalc_dimensions(rc), ",\n");
buffer_sprintf(wb,
"\t\t\t\"db_after\": %lu,\n"
@@ -322,8 +295,6 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
buffer_strcat(wb, "\n");
buffer_strcat(wb, "\t\t}");
-
- freez(replaced_info);
}
//void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
@@ -336,27 +307,30 @@ void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* contexts, RRDCAL
char *tok = NULL;
char *p = NULL;
- rrdhost_rdlock(host);
-
if (contexts) {
p = (char*)buffer_tostring(contexts);
while(p && *p && (tok = mystrsep(&p, ", |"))) {
if(!*tok) continue;
- for(rc = host->alarms; rc ; rc = rc->next) {
+ STRING *tok_string = string_strdupz(tok);
+
+ foreach_rrdcalc_in_rrdhost_read(host, rc) {
if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
continue;
if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset)))
continue;
- if(unlikely(rc->rrdset && rc->rrdset->hash_context == simple_hash(tok)
- && !strcmp(rc->rrdset->context, tok)
- && ((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status)))
+ if(unlikely(rc->rrdset
+ && rc->rrdset->context == tok_string
+ && ((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status)))
numberOfAlarms++;
}
+ foreach_rrdcalc_in_rrdhost_done(rc);
+
+ string_freez(tok_string);
}
}
else {
- for(rc = host->alarms; rc ; rc = rc->next) {
+ foreach_rrdcalc_in_rrdhost_read(host, rc) {
if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
continue;
if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset)))
@@ -364,16 +338,16 @@ void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* contexts, RRDCAL
if(unlikely((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status))
numberOfAlarms++;
}
+ foreach_rrdcalc_in_rrdhost_done(rc);
}
buffer_sprintf(wb, "%d", numberOfAlarms);
- rrdhost_unlock(host);
}
static void health_alarms2json_fill_alarms(RRDHOST *host, BUFFER *wb, int all, void (*fp)(RRDHOST *, BUFFER *, RRDCALC *)) {
RRDCALC *rc;
- int i;
- for(i = 0, rc = host->alarms; rc ; rc = rc->next) {
+ int i = 0;
+ foreach_rrdcalc_in_rrdhost_read(host, rc) {
if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
continue;
@@ -387,44 +361,43 @@ static void health_alarms2json_fill_alarms(RRDHOST *host, BUFFER *wb, int all, v
fp(host, wb, rc);
i++;
}
+ foreach_rrdcalc_in_rrdhost_done(rc);
}
void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
- rrdhost_rdlock(host);
buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
"\n\t\"latest_alarm_log_unique_id\": %u,"
"\n\t\"status\": %s,"
"\n\t\"now\": %lu,"
"\n\t\"alarms\": {\n",
- host->hostname,
+ rrdhost_hostname(host),
(host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0,
host->health_enabled?"true":"false",
(unsigned long)now_realtime_sec());
health_alarms2json_fill_alarms(host, wb, all, health_rrdcalc2json_nolock);
+// rrdhost_rdlock(host);
// buffer_strcat(wb, "\n\t},\n\t\"templates\": {");
// RRDCALCTEMPLATE *rt;
// for(rt = host->templates; rt ; rt = rt->next)
// health_rrdcalctemplate2json_nolock(wb, rt);
+// rrdhost_unlock(host);
buffer_strcat(wb, "\n\t}\n}\n");
- rrdhost_unlock(host);
}
void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all) {
- rrdhost_rdlock(host);
buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\","
"\n\t\"alarms\": {\n",
- host->hostname);
+ rrdhost_hostname(host));
health_alarms2json_fill_alarms(host, wb, all, health_rrdcalc_values2json_nolock);
buffer_strcat(wb, "\n\t}\n}\n");
- rrdhost_unlock(host);
}
-static int have_recent_alarm(RRDHOST *host, uint32_t alarm_id, time_t mark)
+static int have_recent_alarm(RRDHOST *host, uint32_t alarm_id, uint32_t mark)
{
ALARM_ENTRY *ae = host->health_log.alarms;
diff --git a/health/health_log.c b/health/health_log.c
index f0a05531d..8105e01ae 100644
--- a/health/health_log.c
+++ b/health/health_log.c
@@ -14,11 +14,11 @@ inline int health_alarm_log_open(RRDHOST *host) {
if(host->health_log_fp) {
if (setvbuf(host->health_log_fp, NULL, _IOLBF, 0) != 0)
- error("HEALTH [%s]: cannot set line buffering on health log file '%s'.", host->hostname, host->health_log_filename);
+ error("HEALTH [%s]: cannot set line buffering on health log file '%s'.", rrdhost_hostname(host), host->health_log_filename);
return 0;
}
- error("HEALTH [%s]: cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", host->hostname, host->health_log_filename);
+ error("HEALTH [%s]: cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", rrdhost_hostname(host), host->health_log_filename);
return -1;
}
@@ -45,13 +45,13 @@ static inline void health_log_rotate(RRDHOST *host) {
snprintfz(old_filename, FILENAME_MAX, "%s.old", host->health_log_filename);
if(unlink(old_filename) == -1 && errno != ENOENT)
- error("HEALTH [%s]: cannot remove old alarms log file '%s'", host->hostname, old_filename);
+ error("HEALTH [%s]: cannot remove old alarms log file '%s'", rrdhost_hostname(host), old_filename);
if(link(host->health_log_filename, old_filename) == -1 && errno != ENOENT)
- error("HEALTH [%s]: cannot move file '%s' to '%s'.", host->hostname, host->health_log_filename, old_filename);
+ error("HEALTH [%s]: cannot move file '%s' to '%s'.", rrdhost_hostname(host), host->health_log_filename, old_filename);
if(unlink(host->health_log_filename) == -1 && errno != ENOENT)
- error("HEALTH [%s]: cannot remove old alarms log file '%s'", host->hostname, host->health_log_filename);
+ error("HEALTH [%s]: cannot remove old alarms log file '%s'", rrdhost_hostname(host), host->health_log_filename);
// open it with truncate
host->health_log_fp = fopen(host->health_log_filename, "w");
@@ -59,7 +59,7 @@ static inline void health_log_rotate(RRDHOST *host) {
if(host->health_log_fp)
fclose(host->health_log_fp);
else
- error("HEALTH [%s]: cannot truncate health log '%s'", host->hostname, host->health_log_filename);
+ error("HEALTH [%s]: cannot truncate health log '%s'", rrdhost_hostname(host), host->health_log_filename);
host->health_log_fp = NULL;
@@ -75,12 +75,12 @@ inline void health_label_log_save(RRDHOST *host) {
if(unlikely(host->health_log_fp)) {
BUFFER *wb = buffer_create(1024);
- rrdlabels_to_buffer(localhost->host_labels, wb, "", "=", "", "\t ", NULL, NULL, NULL, NULL);
+ rrdlabels_to_buffer(localhost->rrdlabels, wb, "", "=", "", "\t ", NULL, NULL, NULL, NULL);
char *write = (char *) buffer_tostring(wb);
if (unlikely(fprintf(host->health_log_fp, "L\t%s", write) < 0))
error("HEALTH [%s]: failed to save alarm log entry to '%s'. Health data may be lost in case of abnormal restart.",
- host->hostname, host->health_log_filename);
+ rrdhost_hostname(host), host->health_log_filename);
else
host->health_log_entries_written++;
@@ -103,7 +103,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
"\t%s\t%s\t%s"
"\n"
, (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
- , host->hostname
+ , rrdhost_hostname(host)
, ae->unique_id
, ae->alarm_id
@@ -118,14 +118,14 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
, (uint32_t)ae->exec_run_timestamp
, (uint32_t)ae->delay_up_to_timestamp
- , (ae->name)?ae->name:""
- , (ae->chart)?ae->chart:""
- , (ae->family)?ae->family:""
- , (ae->exec)?ae->exec:""
- , (ae->recipient)?ae->recipient:""
- , (ae->source)?ae->source:""
- , (ae->units)?ae->units:""
- , (ae->info)?ae->info:""
+ , ae_name(ae)
+ , ae_chart_name(ae)
+ , ae_family(ae)
+ , ae_exec(ae)
+ , ae_recipient(ae)
+ , ae_source(ae)
+ , ae_units(ae)
+ , ae_info(ae)
, ae->exec_code
, ae->new_status
@@ -135,11 +135,11 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
, ae->new_value
, ae->old_value
, (uint64_t)ae->last_repeat
- , (ae->classification)?ae->classification:"Unknown"
- , (ae->component)?ae->component:"Unknown"
- , (ae->type)?ae->type:"Unknown"
+ , (ae->classification)?ae_classification(ae):"Unknown"
+ , (ae->component)?ae_component(ae):"Unknown"
+ , (ae->type)?ae_type(ae):"Unknown"
) < 0))
- error("HEALTH [%s]: failed to save alarm log entry to '%s'. Health data may be lost in case of abnormal restart.", host->hostname, host->health_log_filename);
+ error("HEALTH [%s]: failed to save alarm log entry to '%s'. Health data may be lost in case of abnormal restart.", rrdhost_hostname(host), host->health_log_filename);
else {
ae->flags |= HEALTH_ENTRY_FLAG_SAVED;
host->health_log_entries_written++;
@@ -156,18 +156,23 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
static uint32_t is_valid_alarm_id(RRDHOST *host, const char *chart, const char *name, uint32_t alarm_id)
{
- uint32_t hash_chart = simple_hash(chart);
- uint32_t hash_name = simple_hash(name);
+ STRING *chart_string = string_strdupz(chart);
+ STRING *name_string = string_strdupz(name);
+
+ uint32_t ret = 1;
ALARM_ENTRY *ae;
for(ae = host->health_log.alarms; ae ;ae = ae->next) {
- if (unlikely(
- ae->alarm_id == alarm_id && (!(ae->hash_name == hash_name && ae->hash_chart == hash_chart &&
- !strcmp(name, ae->name) && !strcmp(chart, ae->chart))))) {
- return 0;
+ if (unlikely(ae->alarm_id == alarm_id && (!(chart_string == ae->chart && name_string == ae->name)))) {
+ ret = 0;
+ break;
}
}
- return 1;
+
+ string_freez(chart_string);
+ string_freez(name_string);
+
+ return ret;
}
static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename) {
@@ -177,6 +182,14 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
size_t line = 0, len = 0;
ssize_t loaded = 0, updated = 0, errored = 0, duplicate = 0;
+ DICTIONARY *all_rrdcalcs = dictionary_create(
+ DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | DICT_OPTION_DONT_OVERWRITE_VALUE);
+ RRDCALC *rc;
+ foreach_rrdcalc_in_rrdhost_read(host, rc) {
+ dictionary_set(all_rrdcalcs, rrdcalc_name(rc), rc, sizeof(*rc));
+ }
+ foreach_rrdcalc_in_rrdhost_done(rc);
+
netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
while((s = fgets_trim_len(buf, 65536, fp, &len))) {
@@ -192,7 +205,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
*s = '\0';
pointers[entries++] = ++s;
if(entries >= max_entries) {
- error("HEALTH [%s]: line %zu of file '%s' has more than %d entries. Ignoring excessive entries.", host->hostname, line, filename, max_entries);
+ error("HEALTH [%s]: line %zu of file '%s' has more than %d entries. Ignoring excessive entries.", rrdhost_hostname(host), line, filename, max_entries);
break;
}
}
@@ -206,7 +219,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
ALARM_ENTRY *ae = NULL;
if(entries < 27) {
- error("HEALTH [%s]: line %zu of file '%s' should have at least 27 entries, but it has %d. Ignoring it.", host->hostname, line, filename, entries);
+ error("HEALTH [%s]: line %zu of file '%s' should have at least 27 entries, but it has %d. Ignoring it.", rrdhost_hostname(host), line, filename, entries);
errored++;
continue;
}
@@ -214,14 +227,14 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
// check that we have valid ids
uint32_t unique_id = (uint32_t)strtoul(pointers[2], NULL, 16);
if(!unique_id) {
- error("HEALTH [%s]: line %zu of file '%s' states alarm entry with invalid unique id %u (%s). Ignoring it.", host->hostname, line, filename, unique_id, pointers[2]);
+ error("HEALTH [%s]: line %zu of file '%s' states alarm entry with invalid unique id %u (%s). Ignoring it.", rrdhost_hostname(host), line, filename, unique_id, pointers[2]);
errored++;
continue;
}
uint32_t alarm_id = (uint32_t)strtoul(pointers[3], NULL, 16);
if(!alarm_id) {
- error("HEALTH [%s]: line %zu of file '%s' states alarm entry for invalid alarm id %u (%s). Ignoring it.", host->hostname, line, filename, alarm_id, pointers[3]);
+ error("HEALTH [%s]: line %zu of file '%s' states alarm entry for invalid alarm id %u (%s). Ignoring it.", rrdhost_hostname(host), line, filename, alarm_id, pointers[3]);
errored++;
continue;
}
@@ -232,18 +245,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
char* alarm_name = pointers[13];
last_repeat = (time_t)strtoul(pointers[27], NULL, 16);
- RRDCALC *rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name));
- if (!rc) {
- for(rc = host->alarms; rc ; rc = rc->next) {
- RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_name, (avl_t *)rc);
- if(rdcmp != rc) {
- error("Cannot insert the alarm index ID using log %s", rc->name);
- }
- }
-
- rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name));
- }
-
+ rc = dictionary_get(all_rrdcalcs, alarm_name);
if(unlikely(rc)) {
if (rrdcalc_isrepeating(rc)) {
rc->last_repeat = last_repeat;
@@ -259,7 +261,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
// make sure it is properly numbered
if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) {
error( "HEALTH [%s]: line %zu of file '%s' has alarm log entry %u in wrong order. Ignoring it."
- , host->hostname, line, filename, unique_id);
+ , rrdhost_hostname(host), line, filename, unique_id);
errored++;
continue;
}
@@ -272,7 +274,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
if(unlikely(unique_id == ae->unique_id)) {
if(unlikely(*pointers[0] == 'A')) {
error("HEALTH [%s]: line %zu of file '%s' adds duplicate alarm log entry %u. Using the later."
- , host->hostname, line, filename, unique_id);
+ , rrdhost_hostname(host), line, filename, unique_id);
*pointers[0] = 'U';
duplicate++;
}
@@ -298,8 +300,13 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
// error("HEALTH [%s]: line %zu of file '%s' provides an alarm for host '%s' but this is named '%s'.", host->hostname, line, filename, pointers[1], host->hostname);
ae->unique_id = unique_id;
- if (!is_valid_alarm_id(host, pointers[14], pointers[13], alarm_id))
- alarm_id = rrdcalc_get_unique_id(host, pointers[14], pointers[13], NULL);
+ if (!is_valid_alarm_id(host, pointers[14], pointers[13], alarm_id)) {
+ STRING *chart = string_strdupz(pointers[14]);
+ STRING *name = string_strdupz(pointers[13]);
+ alarm_id = rrdcalc_get_unique_id(host, chart, name, NULL);
+ string_freez(chart);
+ string_freez(name);
+ }
ae->alarm_id = alarm_id;
ae->alarm_event_id = (uint32_t)strtoul(pointers[4], NULL, 16);
ae->updated_by_id = (uint32_t)strtoul(pointers[5], NULL, 16);
@@ -315,36 +322,29 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
ae->exec_run_timestamp = (uint32_t)strtoul(pointers[11], NULL, 16);
ae->delay_up_to_timestamp = (uint32_t)strtoul(pointers[12], NULL, 16);
- freez(ae->name);
- ae->name = strdupz(pointers[13]);
- ae->hash_name = simple_hash(ae->name);
+ string_freez(ae->name);
+ ae->name = string_strdupz(pointers[13]);
- freez(ae->chart);
- ae->chart = strdupz(pointers[14]);
- ae->hash_chart = simple_hash(ae->chart);
+ string_freez(ae->chart);
+ ae->chart = string_strdupz(pointers[14]);
- freez(ae->family);
- ae->family = strdupz(pointers[15]);
+ string_freez(ae->family);
+ ae->family = string_strdupz(pointers[15]);
- freez(ae->exec);
- ae->exec = strdupz(pointers[16]);
- if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; }
+ string_freez(ae->exec);
+ ae->exec = string_strdupz(pointers[16]);
- freez(ae->recipient);
- ae->recipient = strdupz(pointers[17]);
- if(!*ae->recipient) { freez(ae->recipient); ae->recipient = NULL; }
+ string_freez(ae->recipient);
+ ae->recipient = string_strdupz(pointers[17]);
- freez(ae->source);
- ae->source = strdupz(pointers[18]);
- if(!*ae->source) { freez(ae->source); ae->source = NULL; }
+ string_freez(ae->source);
+ ae->source = string_strdupz(pointers[18]);
- freez(ae->units);
- ae->units = strdupz(pointers[19]);
- if(!*ae->units) { freez(ae->units); ae->units = NULL; }
+ string_freez(ae->units);
+ ae->units = string_strdupz(pointers[19]);
- freez(ae->info);
- ae->info = strdupz(pointers[20]);
- if(!*ae->info) { freez(ae->info); ae->info = NULL; }
+ string_freez(ae->info);
+ ae->info = string_strdupz(pointers[20]);
ae->exec_code = str2i(pointers[21]);
ae->new_status = str2i(pointers[22]);
@@ -357,24 +357,21 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
ae->last_repeat = last_repeat;
if (likely(entries > 30)) {
- freez(ae->classification);
- ae->classification = strdupz(pointers[28]);
- if(!*ae->classification) { freez(ae->classification); ae->classification = NULL; }
+ string_freez(ae->classification);
+ ae->classification = string_strdupz(pointers[28]);
- freez(ae->component);
- ae->component = strdupz(pointers[29]);
- if(!*ae->component) { freez(ae->component); ae->component = NULL; }
+ string_freez(ae->component);
+ ae->component = string_strdupz(pointers[29]);
- freez(ae->type);
- ae->type = strdupz(pointers[30]);
- if(!*ae->type) { freez(ae->type); ae->type = NULL; }
+ string_freez(ae->type);
+ ae->type = string_strdupz(pointers[30]);
}
char value_string[100 + 1];
- freez(ae->old_value_string);
- freez(ae->new_value_string);
- ae->old_value_string = strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae->units, -1));
- ae->new_value_string = strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae->units, -1));
+ string_freez(ae->old_value_string);
+ string_freez(ae->new_value_string);
+ ae->old_value_string = string_strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae_units(ae), -1));
+ ae->new_value_string = string_strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae_units(ae), -1));
// add it to host if not already there
if(unlikely(*pointers[0] == 'A')) {
@@ -395,13 +392,16 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
host->health_max_alarm_id = ae->alarm_id;
}
else {
- error("HEALTH [%s]: line %zu of file '%s' is invalid (unrecognized entry type '%s').", host->hostname, line, filename, pointers[0]);
+ error("HEALTH [%s]: line %zu of file '%s' is invalid (unrecognized entry type '%s').", rrdhost_hostname(host), line, filename, pointers[0]);
errored++;
}
}
netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
+ dictionary_destroy(all_rrdcalcs);
+ all_rrdcalcs = NULL;
+
freez(buf);
if(!host->health_max_unique_id) host->health_max_unique_id = (uint32_t)now_realtime_sec();
@@ -411,7 +411,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
if (unlikely(!host->health_log.next_alarm_id || host->health_log.next_alarm_id <= host->health_max_alarm_id))
host->health_log.next_alarm_id = host->health_max_alarm_id + 1;
- debug(D_HEALTH, "HEALTH [%s]: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", host->hostname, filename, loaded, updated, errored, duplicate);
+ debug(D_HEALTH, "HEALTH [%s]: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", rrdhost_hostname(host), filename, loaded, updated, errored, duplicate);
return loaded;
}
@@ -422,7 +422,7 @@ inline void health_alarm_log_load(RRDHOST *host) {
snprintfz(filename, FILENAME_MAX, "%s.old", host->health_log_filename);
FILE *fp = fopen(filename, "r");
if(!fp)
- error("HEALTH [%s]: cannot open health file: %s", host->hostname, filename);
+ error("HEALTH [%s]: cannot open health file: %s", rrdhost_hostname(host), filename);
else {
health_alarm_log_read(host, fp, filename);
fclose(fp);
@@ -431,7 +431,7 @@ inline void health_alarm_log_load(RRDHOST *host) {
host->health_log_entries_written = 0;
fp = fopen(host->health_log_filename, "r");
if(!fp)
- error("HEALTH [%s]: cannot open health file: %s", host->hostname, host->health_log_filename);
+ error("HEALTH [%s]: cannot open health file: %s", rrdhost_hostname(host), host->health_log_filename);
else {
health_alarm_log_read(host, fp, host->health_log_filename);
fclose(fp);
@@ -443,63 +443,48 @@ inline void health_alarm_log_load(RRDHOST *host) {
// health alarm log management
inline ALARM_ENTRY* health_create_alarm_entry(
- RRDHOST *host,
- uint32_t alarm_id,
- uint32_t alarm_event_id,
- uuid_t config_hash_id,
- time_t when,
- const char *name,
- const char *chart,
- const char *chart_context,
- const char *family,
- const char *class,
- const char *component,
- const char *type,
- const char *exec,
- const char *recipient,
- time_t duration,
- NETDATA_DOUBLE old_value,
- NETDATA_DOUBLE new_value,
- RRDCALC_STATUS old_status,
- RRDCALC_STATUS new_status,
- const char *source,
- const char *units,
- const char *info,
- int delay,
- uint32_t flags
+ RRDHOST *host,
+ uint32_t alarm_id,
+ uint32_t alarm_event_id,
+ const uuid_t config_hash_id,
+ time_t when,
+ STRING *name,
+ STRING *chart,
+ STRING *chart_context,
+ STRING *family,
+ STRING *class,
+ STRING *component,
+ STRING *type,
+ STRING *exec,
+ STRING *recipient,
+ time_t duration,
+ NETDATA_DOUBLE old_value,
+ NETDATA_DOUBLE new_value,
+ RRDCALC_STATUS old_status,
+ RRDCALC_STATUS new_status,
+ STRING *source,
+ STRING *units,
+ STRING *info,
+ int delay,
+ uint32_t flags
) {
debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
- ae->name = strdupz(name);
- ae->hash_name = simple_hash(ae->name);
-
- if(chart) {
- ae->chart = strdupz(chart);
- ae->hash_chart = simple_hash(ae->chart);
- }
-
- if(chart_context)
- ae->chart_context = strdupz(chart_context);
+ ae->name = string_dup(name);
+ ae->chart = string_dup(chart);
+ ae->chart_context = string_dup(chart_context);
uuid_copy(ae->config_hash_id, *((uuid_t *) config_hash_id));
- if(family)
- ae->family = strdupz(family);
-
- if (class)
- ae->classification = strdupz(class);
-
- if (component)
- ae->component = strdupz(component);
-
- if (type)
- ae->type = strdupz(type);
-
- if(exec) ae->exec = strdupz(exec);
- if(recipient) ae->recipient = strdupz(recipient);
- if(source) ae->source = strdupz(source);
- if(units) ae->units = strdupz(units);
+ ae->family = string_dup(family);
+ ae->classification = string_dup(class);
+ ae->component = string_dup(component);
+ ae->type = string_dup(type);
+ ae->exec = string_dup(exec);
+ ae->recipient = string_dup(recipient);
+ ae->source = string_dup(source);
+ ae->units = string_dup(units);
ae->unique_id = host->health_log.next_log_id++;
ae->alarm_id = alarm_id;
@@ -509,27 +494,10 @@ inline ALARM_ENTRY* health_create_alarm_entry(
ae->new_value = new_value;
char value_string[100 + 1];
- ae->old_value_string = strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae->units, -1));
- ae->new_value_string = strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae->units, -1));
-
- char *replaced_info = NULL;
- if (likely(info)) {
- char *m;
- replaced_info = strdupz(info);
- size_t pos = 0;
- while ((m = strstr(replaced_info + pos, "$family"))) {
- char *buf = NULL;
- pos = m - replaced_info;
- buf = find_and_replace(replaced_info, "$family", (ae->family) ? ae->family : "", m);
- freez(replaced_info);
- replaced_info = strdupz(buf);
- freez(buf);
- }
- }
-
- if(replaced_info) ae->info = strdupz(replaced_info);
- freez(replaced_info);
+ ae->old_value_string = string_strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae_units(ae), -1));
+ ae->new_value_string = string_strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae_units(ae), -1));
+ ae->info = string_dup(info);
ae->old_status = old_status;
ae->new_status = new_status;
ae->duration = duration;
@@ -545,7 +513,7 @@ inline ALARM_ENTRY* health_create_alarm_entry(
return ae;
}
-inline void health_alarm_log(
+inline void health_alarm_log_add_entry(
RRDHOST *host,
ALARM_ENTRY *ae
) {
@@ -585,26 +553,24 @@ inline void health_alarm_log(
}
inline void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae) {
- freez(ae->name);
- freez(ae->chart);
- freez(ae->chart_context);
- freez(ae->family);
- freez(ae->classification);
- freez(ae->component);
- freez(ae->type);
- freez(ae->exec);
- freez(ae->recipient);
- freez(ae->source);
- freez(ae->units);
- freez(ae->info);
- freez(ae->old_value_string);
- freez(ae->new_value_string);
+ string_freez(ae->name);
+ string_freez(ae->chart);
+ string_freez(ae->chart_context);
+ string_freez(ae->family);
+ string_freez(ae->classification);
+ string_freez(ae->component);
+ string_freez(ae->type);
+ string_freez(ae->exec);
+ string_freez(ae->recipient);
+ string_freez(ae->source);
+ string_freez(ae->units);
+ string_freez(ae->info);
+ string_freez(ae->old_value_string);
+ string_freez(ae->new_value_string);
freez(ae);
}
inline void health_alarm_log_free(RRDHOST *host) {
- rrdhost_check_wrlock(host);
-
netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
ALARM_ENTRY *ae;
diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in
index 0dfecade5..3edf3d083 100755
--- a/health/notifications/alarm-notify.sh.in
+++ b/health/notifications/alarm-notify.sh.in
@@ -199,7 +199,7 @@ fi
[ -z "${NETDATA_STOCK_CONFIG_DIR}" ] && NETDATA_STOCK_CONFIG_DIR="@libconfigdir_POST@"
[ -z "${NETDATA_CACHE_DIR}" ] && NETDATA_CACHE_DIR="@cachedir_POST@"
[ -z "${NETDATA_REGISTRY_URL}" ] && NETDATA_REGISTRY_URL="https://registry.my-netdata.io"
-[ -z "${NETDATA_REGISTRY_CLOUD_BASE_URL}" ] && NETDATA_REGISTRY_CLOUD_BASE_URL="https://app.netdata.cloud"
+[ -z "${NETDATA_REGISTRY_CLOUD_BASE_URL}" ] && NETDATA_REGISTRY_CLOUD_BASE_URL="https://api.netdata.cloud"
# -----------------------------------------------------------------------------
# parse command line parameters
@@ -250,7 +250,7 @@ fi
# -----------------------------------------------------------------------------
# find a suitable hostname to use, if netdata did not supply a hostname
-if [ -z ${args_host} ]; then
+if [ -z "${args_host}" ]; then
this_host=$(hostname -s 2>/dev/null)
host="${this_host}"
args_host="${this_host}"
@@ -428,6 +428,10 @@ else
done
fi
+if [[ ! $curl_options =~ .*\--connect-timeout ]]; then
+ curl_options+=" --connect-timeout 5"
+fi
+
OPSGENIE_API_URL=${OPSGENIE_API_URL:-"https://api.opsgenie.com"}
# If we didn't autodetect the character set for e-mail and it wasn't
@@ -1335,21 +1339,37 @@ send_telegram() {
if [ "${SEND_TELEGRAM}" = "YES" ] && [ -n "${bottoken}" ] && [ -n "${chatids}" ] && [ -n "${message}" ]; then
for chatid in ${chatids}; do
- # https://core.telegram.org/bots/api#sendmessage
- httpcode=$(docurl ${disableNotification} \
- --data-urlencode "parse_mode=HTML" \
- --data-urlencode "disable_web_page_preview=true" \
- --data-urlencode "text=${emoji} ${message}" \
- "https://api.telegram.org/bot${bottoken}/sendMessage?chat_id=${chatid}")
-
- if [ "${httpcode}" = "200" ]; then
- info "sent telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}'"
- sent=$((sent + 1))
- elif [ "${httpcode}" = "401" ]; then
- error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}': Wrong bot token."
- else
- error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}' with HTTP response status code ${httpcode}."
- fi
+ notify_telegram=1
+ notify_retries=${TELEGRAM_RETRIES_ON_LIMIT:-0}
+
+ while [ ${notify_telegram} -eq 1 ]; do
+ # https://core.telegram.org/bots/api#sendmessage
+ httpcode=$(docurl ${disableNotification} \
+ --data-urlencode "parse_mode=HTML" \
+ --data-urlencode "disable_web_page_preview=true" \
+ --data-urlencode "text=${emoji} ${message}" \
+ "https://api.telegram.org/bot${bottoken}/sendMessage?chat_id=${chatid}")
+
+ notify_telegram=0
+
+ if [ "${httpcode}" = "200" ]; then
+ info "sent telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}'"
+ sent=$((sent + 1))
+ elif [ "${httpcode}" = "401" ]; then
+ error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}': Wrong bot token."
+ elif [ "${httpcode}" = "429" ]; then
+ if [ "$notify_retries" -gt 0 ]; then
+ error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}': rate limit exceeded, retrying after 1s."
+ notify_retries=$((notify_retries - 1))
+ notify_telegram=1
+ sleep 1
+ else
+ error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}': rate limit exceeded."
+ fi
+ else
+ error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}' with HTTP response status code ${httpcode}."
+ fi
+ done
done
[ ${sent} -gt 0 ] && return 0
@@ -2398,7 +2418,7 @@ status_email_subject="${status}"
case "${status}" in
CRITICAL)
image="${images_base_url}/images/alert-128-red.png"
- alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_critical.png"
+ alarm_badge="https://app.netdata.cloud/static/email/img/label_critical.png"
status_message="is critical"
status_email_subject="Critical"
color="#ca414b"
@@ -2411,7 +2431,7 @@ CRITICAL)
WARNING)
image="${images_base_url}/images/alert-128-orange.png"
- alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_warning.png"
+ alarm_badge="https://app.netdata.cloud/static/email/img/label_warning.png"
status_message="needs attention"
status_email_subject="Warning"
color="#ffc107"
@@ -2424,7 +2444,7 @@ WARNING)
CLEAR)
image="${images_base_url}/images/check-mark-2-128-green.png"
- alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_recovered.png"
+ alarm_badge="https://app.netdata.cloud/static/email/img/label_recovered.png"
status_message="recovered"
status_email_subject="Clear"
color="#77ca6d"
diff --git a/health/notifications/health_alarm_notify.conf b/health/notifications/health_alarm_notify.conf
index b69c6d538..52de86645 100755
--- a/health/notifications/health_alarm_notify.conf
+++ b/health/notifications/health_alarm_notify.conf
@@ -443,6 +443,11 @@ SEND_TELEGRAM="YES"
# Without it, netdata cannot send telegram messages.
TELEGRAM_BOT_TOKEN=""
+# If an API limit error is returned on sending a message, Netdata will retry this number of times before giving up.
+# Setting the number to 0 makes Netdata do no retries (which is the default).
+# See https://core.telegram.org/bots/faq#my-bot-is-hitting-limits-how-do-i-avoid-this
+TELEGRAM_RETRIES_ON_LIMIT="0"
+
# To get your chat ID send the command /getid to telegram bot @myidbot
# (https://t.me/myidbot). Each user also needs to open a conversation with the
# bot that will be sending notifications.