diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2022-11-30 18:47:00 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2022-11-30 18:47:00 +0000 |
commit | 03bf87dcb06f7021bfb2df2fa8691593c6148aff (patch) | |
tree | e16b06711a2ed77cafb4b7754be0220c3d14a9d7 /health | |
parent | Adding upstream version 1.36.1. (diff) | |
download | netdata-upstream/1.37.0.tar.xz netdata-upstream/1.37.0.zip |
Adding upstream version 1.37.0. upstream/1.37.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health')
-rw-r--r-- | health/Makefile.am | 3 | ||||
-rw-r--r-- | health/REFERENCE.md | 38 | ||||
-rw-r--r-- | health/health.c | 1431 | ||||
-rw-r--r-- | health/health.d/dns_query.conf | 17 | ||||
-rw-r--r-- | health/health.d/go.d.plugin.conf | 2 | ||||
-rw-r--r-- | health/health.d/ml.conf | 21 | ||||
-rw-r--r-- | health/health.d/mysql.conf | 34 | ||||
-rw-r--r-- | health/health.d/nvme.conf | 15 | ||||
-rw-r--r-- | health/health.d/pihole.conf | 23 | ||||
-rw-r--r-- | health/health.d/ping.conf | 50 | ||||
-rw-r--r-- | health/health.d/postgres.conf | 214 | ||||
-rw-r--r-- | health/health.d/python.d.plugin.conf | 2 | ||||
-rw-r--r-- | health/health.d/redis.conf | 29 | ||||
-rw-r--r-- | health/health.d/systemdunits.conf | 105 | ||||
-rw-r--r-- | health/health.d/tcp_resets.conf | 4 | ||||
-rw-r--r-- | health/health.d/timex.conf | 2 | ||||
-rw-r--r-- | health/health.h | 116 | ||||
-rw-r--r-- | health/health_config.c | 705 | ||||
-rw-r--r-- | health/health_json.c | 157 | ||||
-rw-r--r-- | health/health_log.c | 316 | ||||
-rwxr-xr-x | health/notifications/alarm-notify.sh.in | 60 | ||||
-rwxr-xr-x | health/notifications/health_alarm_notify.conf | 5 |
22 files changed, 1958 insertions, 1391 deletions
diff --git a/health/Makefile.am b/health/Makefile.am index 777b35858..7c8d7f9d2 100644 --- a/health/Makefile.am +++ b/health/Makefile.am @@ -65,8 +65,11 @@ dist_healthconfig_DATA = \ health.d/mysql.conf \ health.d/net.conf \ health.d/netfilter.conf \ + health.d/nvme.conf \ health.d/nut.conf \ health.d/pihole.conf \ + health.d/ping.conf \ + health.d/postgres.conf \ health.d/portcheck.conf \ health.d/processes.conf \ health.d/python.d.plugin.conf \ diff --git a/health/REFERENCE.md b/health/REFERENCE.md index d1af74767..90da4102a 100644 --- a/health/REFERENCE.md +++ b/health/REFERENCE.md @@ -536,12 +536,48 @@ See our [simple patterns docs](/libnetdata/simple_pattern/README.md) for more ex #### Alarm line `info` -The info field can contain a small piece of text describing the alarm or template. This will be rendered in notifications and UI elements whenever the specific alarm is in focus. An example for the `ram_available` alarm is: +The info field can contain a small piece of text describing the alarm or template. This will be rendered in +notifications and UI elements whenever the specific alarm is in focus. An example for the `ram_available` alarm is: ```yaml info: percentage of estimated amount of RAM available for userspace processes, without causing swapping ``` +info fields can contain special variables in their text that will be replaced during run-time to provide more specific +alert information. Current variables supported are: + +| variable | description | +| ---------| ----------- | +| $family | Will be replaced by the family instance for the alert (e.g. 
eth0) | +| $label: | Followed by a chart label name, this will replace the variable with the chart label's value | + +For example, an info field like the following: + +```yaml +info: average inbound utilization for the network interface $family over the last minute +``` + +Will be rendered on the alert acting on interface `eth0` as: + +```yaml +info: average inbound utilization for the network interface eth0 over the last minute +``` + +An alert acting on a chart that has a chart label named e.g. `target`, with a value of `https://netdata.cloud/`, +can be enriched as follows: + +```yaml +info: average ratio of HTTP responses with unexpected status over the last 5 minutes for the site $label:target +``` + +Will become: + +```yaml +info: average ratio of HTTP responses with unexpected status over the last 5 minutes for the site https://netdata.cloud/ +``` + +> Please note that variable names are case sensitive. + ## Expressions Netdata has an internal [infix expression parser](/libnetdata/eval). 
This parses expressions and creates an internal diff --git a/health/health.c b/health/health.c index 9eb36a9c6..3784e0f31 100644 --- a/health/health.c +++ b/health/health.c @@ -2,11 +2,166 @@ #include "health.h" +#define WORKER_HEALTH_JOB_RRD_LOCK 0 +#define WORKER_HEALTH_JOB_HOST_LOCK 1 +#define WORKER_HEALTH_JOB_DB_QUERY 2 +#define WORKER_HEALTH_JOB_CALC_EVAL 3 +#define WORKER_HEALTH_JOB_WARNING_EVAL 4 +#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5 +#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6 +#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7 +#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET 8 +#define WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM 9 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 10 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 10 +#endif + +static bool prepare_command(BUFFER *wb, + const char *exec, + const char *recipient, + const char *registry_hostname, + uint32_t unique_id, + uint32_t alarm_id, + uint32_t alarm_event_id, + uint32_t when, + const char *alert_name, + const char *alert_chart_name, + const char *alert_family, + const char *new_status, + const char *old_status, + NETDATA_DOUBLE new_value, + NETDATA_DOUBLE old_value, + const char *alert_source, + uint32_t duration, + uint32_t non_clear_duration, + const char *alert_units, + const char *alert_info, + const char *new_value_string, + const char *old_value_string, + const char *source, + const char *error_msg, + int n_warn, + int n_crit, + const char *warn_alarms, + const char *crit_alarms, + const char *classification, + const char *edit_command, + const char *machine_guid) +{ + char buf[8192]; + size_t n = 8192 - 1; + + buffer_strcat(wb, "exec"); + + if (!sanitize_command_argument_string(buf, exec, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, recipient, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, registry_hostname, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + 
buffer_sprintf(wb, " '%u'", unique_id); + + buffer_sprintf(wb, " '%u'", alarm_id); + + buffer_sprintf(wb, " '%u'", alarm_event_id); + + buffer_sprintf(wb, " '%u'", when); + + if (!sanitize_command_argument_string(buf, alert_name, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, alert_chart_name, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, alert_family, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, new_status, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, old_status, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", new_value); + + buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", old_value); + + if (!sanitize_command_argument_string(buf, alert_source, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + buffer_sprintf(wb, " '%u'", duration); + + buffer_sprintf(wb, " '%u'", non_clear_duration); + + if (!sanitize_command_argument_string(buf, alert_units, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, alert_info, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, new_value_string, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, old_value_string, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, source, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, error_msg, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + buffer_sprintf(wb, " '%d'", n_warn); + + buffer_sprintf(wb, " '%d'", n_crit); + + if (!sanitize_command_argument_string(buf, warn_alarms, n)) + return false; + buffer_sprintf(wb, " 
'%s'", buf); + + if (!sanitize_command_argument_string(buf, crit_alarms, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, classification, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, edit_command, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + if (!sanitize_command_argument_string(buf, machine_guid, n)) + return false; + buffer_sprintf(wb, " '%s'", buf); + + return true; +} + unsigned int default_health_enabled = 1; char *silencers_filename; // the queue of executed alarm notifications that haven't been waited for yet -static struct { +static __thread struct { ALARM_ENTRY *head; // oldest ALARM_ENTRY *tail; // latest } alarm_notifications_in_progress = {NULL, NULL}; @@ -146,77 +301,51 @@ void health_init(void) { * @param host the structure of the host that the function will reload the configuration. */ static void health_reload_host(RRDHOST *host) { - if(unlikely(!host->health_enabled)) + if(unlikely(!host->health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) return; + log_health("[%s]: Reloading health.", rrdhost_hostname(host)); + char *user_path = health_user_config_dir(); char *stock_path = health_stock_config_dir(); // free all running alarms - rrdhost_wrlock(host); - - while(host->templates) - rrdcalctemplate_unlink_and_free(host, host->templates); - - RRDCALCTEMPLATE *rt,*next; - for(rt = host->alarms_template_with_foreach; rt ; rt = next) { - next = rt->next; - rrdcalctemplate_free(rt); - } - host->alarms_template_with_foreach = NULL; - - while(host->alarms) - rrdcalc_unlink_and_free(host, host->alarms); - - RRDCALC *rc,*nc; - for(rc = host->alarms_with_foreach; rc ; rc = nc) { - nc = rc->next; - rrdcalc_free(rc); - } - host->alarms_with_foreach = NULL; - - rrdhost_unlock(host); + rrdcalc_delete_all(host); + rrdcalctemplate_delete_all(host); // invalidate all previous entries in the alarm log + 
netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); ALARM_ENTRY *t; for(t = host->health_log.alarms ; t ; t = t->next) { if(t->new_status != RRDCALC_STATUS_REMOVED) t->flags |= HEALTH_ENTRY_FLAG_UPDATED; } + netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); - rrdhost_rdlock(host); // reset all thresholds to all charts RRDSET *st; rrdset_foreach_read(st, host) { st->green = NAN; st->red = NAN; } - rrdhost_unlock(host); + rrdset_foreach_done(st); // load the new alarms - rrdhost_wrlock(host); health_readdir(host, user_path, stock_path, NULL); //Discard alarms with labels that do not apply to host - rrdcalc_labels_unlink_alarm_from_host(host); + rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host); // link the loaded alarms to their charts - RRDDIM *rd; rrdset_foreach_write(st, host) { if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED)) continue; - rrdsetcalc_link_matching(st); - rrdcalctemplate_link_matching(st); - //This loop must be the last, because ` rrdcalctemplate_link_matching` will create alarms related to it. 
- rrdset_rdlock(st); - rrddim_foreach_read(rd, st) { - rrdcalc_link_to_rrddim(rd, st, host); - } - rrdset_unlock(st); + rrdcalc_link_matching_alerts_to_rrdset(st); + rrdcalctemplate_link_matching_templates_to_rrdset(st); } - - rrdhost_unlock(host); + rrdset_foreach_done(st); + host->aclk_alert_reloaded = 1; } /** @@ -234,11 +363,6 @@ void health_reload(void) { health_reload_host(host); rrd_unlock(); -#ifdef ENABLE_ACLK - if (netdata_cloud_setting) { - aclk_alert_reloaded = 1; - } -#endif } // ---------------------------------------------------------------------------- @@ -250,7 +374,6 @@ static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) { return RRDCALC_STATUS_CLEAR; } -#define ALARM_EXEC_COMMAND_LENGTH 8192 #define ACTIVE_ALARMS_LIST_EXAMINE 500 #define ACTIVE_ALARMS_LIST 15 @@ -266,13 +389,14 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) { // do not send notifications for internal statuses - debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); goto done; } if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) { // do not send notifications for disabled statuses - debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); + log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification 
enabled)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); // mark it as run, so that we will send the same alarm if it happens again goto done; } @@ -292,7 +416,9 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { // we have executed this alarm notification in the past if(t && t->new_status == ae->new_status) { // don't send the notification for the same status again - debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae->chart, ae->name + debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae) + , rrdcalc_status2string(ae->new_status)); + log_health("[%s]: Health not sending again notification for alarm '%s.%s' status %s", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae) , rrdcalc_status2string(ae->new_status)); goto done; } @@ -303,7 +429,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) { if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) { debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s" - , ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + , ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); goto done; } } @@ -312,14 +438,14 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { // Check if alarm notifications are silenced if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) { - info("Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + log_health("[%s]: Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), 
rrdcalc_status2string(ae->new_status)); goto done; } - static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1]; + log_health("[%s]: Sending notification for alarm '%s.%s' status %s.", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status)); - const char *exec = (ae->exec) ? ae->exec : host->health_default_exec; - const char *recipient = (ae->recipient) ? ae->recipient : host->health_default_recipient; + const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health_default_exec); + const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health_default_recipient); int n_warn=0, n_crit=0; RRDCALC *rc; @@ -330,13 +456,16 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); - for(rc = host->alarms; rc && (n_warn + n_crit) < ACTIVE_ALARMS_LIST_EXAMINE ; rc = rc->next) { + foreach_rrdcalc_in_rrdhost_read(host, rc) { if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) continue; + if(unlikely((n_warn + n_crit) >= ACTIVE_ALARMS_LIST_EXAMINE)) + break; + if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) { if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) { - active_alerts[n_warn+n_crit].name = rc->name; + active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc); active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change; active_alerts[n_warn+n_crit].status = rc->status; n_warn++; @@ -344,7 +473,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { expr = rc->warning; } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) { if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) { - active_alerts[n_warn+n_crit].name = rc->name; + active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc); 
active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change; active_alerts[n_warn+n_crit].status = rc->status; n_crit++; @@ -355,6 +484,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { expr = rc->warning; } } + foreach_rrdcalc_in_rrdhost_done(rc); if (n_warn+n_crit>1) qsort (active_alerts, n_warn+n_crit, sizeof(active_alerts_t), compare_active_alerts); @@ -379,51 +509,55 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { } } - char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN"); - - snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" NETDATA_DOUBLE_FORMAT_ZERO - "' '" NETDATA_DOUBLE_FORMAT_ZERO - "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'", - exec, - recipient, - host->registry_hostname, - ae->unique_id, - ae->alarm_id, - ae->alarm_event_id, - (unsigned long)ae->when, - ae->name, - ae->chart?ae->chart:"NOCHART", - ae->family?ae->family:"NOFAMILY", - rrdcalc_status2string(ae->new_status), - rrdcalc_status2string(ae->old_status), - ae->new_value, - ae->old_value, - ae->source?ae->source:"UNKNOWN", - (uint32_t)ae->duration, - (uint32_t)ae->non_clear_duration, - ae->units?ae->units:"", - ae->info?ae->info:"", - ae->new_value_string, - ae->old_value_string, - (expr && expr->source)?expr->source:"NOSOURCE", - (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG", - n_warn, - n_crit, - buffer_tostring(warn_alarms), - buffer_tostring(crit_alarms), - ae->classification?ae->classification:"Unknown", - edit_command, - host != localhost ? 
host->machine_guid:"" - ); - - ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN; - ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */ - - debug(D_HEALTH, "executing command '%s'", command_to_run); - ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS; - ae->exec_spawn_serial = spawn_enq_cmd(command_to_run); - enqueue_alarm_notify_in_progress(ae); + char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN"); + + BUFFER *wb = buffer_create(8192); + bool ok = prepare_command(wb, + exec, + recipient, + rrdhost_registry_hostname(host), + ae->unique_id, + ae->alarm_id, + ae->alarm_event_id, + (unsigned long)ae->when, + ae_name(ae), + ae->chart?ae_chart_name(ae):"NOCHART", + ae->family?ae_family(ae):"NOFAMILY", + rrdcalc_status2string(ae->new_status), + rrdcalc_status2string(ae->old_status), + ae->new_value, + ae->old_value, + ae->source?ae_source(ae):"UNKNOWN", + (uint32_t)ae->duration, + (uint32_t)ae->non_clear_duration, + ae_units(ae), + ae_info(ae), + ae_new_value_string(ae), + ae_old_value_string(ae), + (expr && expr->source)?expr->source:"NOSOURCE", + (expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG", + n_warn, + n_crit, + buffer_tostring(warn_alarms), + buffer_tostring(crit_alarms), + ae->classification?ae_classification(ae):"Unknown", + edit_command, + host != localhost ? 
host->machine_guid:""); + + const char *command_to_run = buffer_tostring(wb); + if (ok) { + ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN; + ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */ + + debug(D_HEALTH, "executing command '%s'", command_to_run); + ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS; + ae->exec_spawn_serial = spawn_enq_cmd(command_to_run); + enqueue_alarm_notify_in_progress(ae); + } else { + error("Failed to format command arguments"); + } + buffer_free(wb); freez(edit_command); buffer_free(warn_alarms); buffer_free(crit_alarms); @@ -450,7 +584,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) { static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) { debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s", - ae->chart?ae->chart:"NOCHART", ae->name, + ae->chart?ae_chart_name(ae):"NOCHART", ae_name(ae), ae->new_value, rrdcalc_status2string(ae->old_status), rrdcalc_status2string(ae->new_status) @@ -467,7 +601,7 @@ static inline void health_alarm_log_process(RRDHOST *host) { ALARM_ENTRY *ae; for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) { - if(likely(!alarm_entry_isrepeating(host, ae))) { + if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) { if(unlikely( !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) && !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED) @@ -481,13 +615,13 @@ static inline void health_alarm_log_process(RRDHOST *host) { } } + netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); + // remember this for the next iteration host->health_last_processed_id = first_waiting; bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max; - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); - if (!cleanup_excess_log_entries) return; @@ -508,7 +642,7 @@ static inline void health_alarm_log_process(RRDHOST *host) { ALARM_ENTRY *t = 
ae->next; - if(likely(!alarm_entry_isrepeating(host, ae))) { + if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) { health_alarm_wait_for_execution(ae); health_alarm_log_free_one_nochecks_nounlink(ae); host->health_log.count--; @@ -522,7 +656,7 @@ static inline void health_alarm_log_process(RRDHOST *host) { static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) { if(unlikely(!rc->rrdset)) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } @@ -533,40 +667,38 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) *next_run = rc->next_update; } - debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rc->chart?rc->chart:"NOCHART", rc->name, (int) (rc->next_update - now)); + debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now)); return 0; } if(unlikely(!rc->update_every)) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. 
The chart has been marked as archived", rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) { - debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc)); return 0; } int update_every = rc->rrdset->update_every; - rrdset_rdlock(rc->rrdset); - time_t first = rrdset_first_entry_t_nolock(rc->rrdset); - time_t last = rrdset_last_entry_t_nolock(rc->rrdset); - rrdset_unlock(rc->rrdset); + time_t first = rrdset_first_entry_t(rc->rrdset); + time_t last = rrdset_last_entry_t(rc->rrdset); if(unlikely(now + update_every < first /* || now - update_every > last */)) { debug(D_HEALTH , "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)." - , rc->chart ? rc->chart : "NOCHART", rc->name, (unsigned long) now, (unsigned long) first + , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) now, (unsigned long) first , (unsigned long) last); return 0; } @@ -577,7 +709,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) if(needed + update_every < first || needed - update_every > last) { debug(D_HEALTH , "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)." - , rc->chart ? 
rc->chart : "NOCHART", rc->name, (unsigned long) needed, (unsigned long) first + , rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) needed, (unsigned long) first , (unsigned long) last); return 0; } @@ -587,7 +719,7 @@ static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) } static inline int check_if_resumed_from_suspension(void) { - static usec_t last_realtime = 0, last_monotonic = 0; + static __thread usec_t last_realtime = 0, last_monotonic = 0; usec_t realtime = now_realtime_usec(), monotonic = now_monotonic_usec(); int ret = 0; @@ -603,41 +735,142 @@ static inline int check_if_resumed_from_suspension(void) { return ret; } -static void health_main_cleanup(void *ptr) { +static void health_thread_cleanup(void *ptr) { worker_unregister(); - struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr; - static_thread->enabled = NETDATA_MAIN_THREAD_EXITING; + struct health_state *h = ptr; + h->host->health_spawn = 0; + + netdata_thread_cancel(netdata_thread_self()); + log_health("[%s]: Health thread ended.", rrdhost_hostname(h->host)); + debug(D_HEALTH, "HEALTH %s: Health thread ended.", rrdhost_hostname(h->host)); +} + +static void initialize_health(RRDHOST *host, int is_localhost) { + if(!host->health_enabled || rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) return; + rrdhost_flag_set(host, RRDHOST_FLAG_INITIALIZED_HEALTH); + + log_health("[%s]: Initializing health.", rrdhost_hostname(host)); + + host->health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never"); + host->health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never"); + + host->health_log.next_log_id = 1; + host->health_log.next_alarm_id = 1; + host->health_log.max = 1000; + host->health_log.next_log_id = (uint32_t)now_realtime_sec(); + host->health_log.next_alarm_id = 0; + + long n = config_get_number(CONFIG_SECTION_HEALTH, "in 
memory max health log entries", host->health_log.max); + if(n < 10) { + error("Host '%s': health configuration has invalid max log entries %ld. Using default %u", rrdhost_hostname(host), n, host->health_log.max); + config_set_number(CONFIG_SECTION_HEALTH, "in memory max health log entries", (long)host->health_log.max); + } + else + host->health_log.max = (unsigned int)n; + + netdata_rwlock_init(&host->health_log.alarm_log_rwlock); + + char filename[FILENAME_MAX + 1]; + + if(!is_localhost) { + int r = mkdir(host->varlib_dir, 0775); + if (r != 0 && errno != EEXIST) + error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), host->varlib_dir); + } + + { + snprintfz(filename, FILENAME_MAX, "%s/health", host->varlib_dir); + int r = mkdir(filename, 0775); + if(r != 0 && errno != EEXIST) + error("Host '%s': cannot create directory '%s'", rrdhost_hostname(host), filename); + } + snprintfz(filename, FILENAME_MAX, "%s/health/health-log.db", host->varlib_dir); + host->health_log_filename = strdupz(filename); + + snprintfz(filename, FILENAME_MAX, "%s/alarm-notify.sh", netdata_configured_primary_plugins_dir); + host->health_default_exec = string_strdupz(config_get(CONFIG_SECTION_HEALTH, "script to execute on alarm", filename)); + host->health_default_recipient = string_strdupz("root"); + + if (!file_is_migrated(host->health_log_filename)) { + int rc = sql_create_health_log_table(host); + if (unlikely(rc)) { + log_health("[%s]: Failed to create health log table in the database", rrdhost_hostname(host)); + health_alarm_log_load(host); + health_alarm_log_open(host); + } + else { + health_alarm_log_load(host); + add_migrated_file(host->health_log_filename, 0); + } + } else { + // TODO: This needs to go to the metadata thread + // Health should wait before accessing the table (needs to be created by the metadata thread) + sql_create_health_log_table(host); + sql_health_alarm_log_load(host); + } + + // 
------------------------------------------------------------------------ + // load health configuration + + health_readdir(host, health_user_config_dir(), health_stock_config_dir(), NULL); - info("cleaning up..."); + // link the loaded alarms to their charts + RRDSET *st; + rrdset_foreach_write(st, host) { + if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED)) + continue; + + rrdcalc_link_matching_alerts_to_rrdset(st); + rrdcalctemplate_link_matching_templates_to_rrdset(st); + } + rrdset_foreach_done(st); - static_thread->enabled = NETDATA_MAIN_THREAD_EXITED; + //Discard alarms with labels that do not apply to host + rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host); + + health_silencers_init(); +} + +static void health_sleep(time_t next_run, unsigned int loop __maybe_unused, RRDHOST *host) { + time_t now = now_realtime_sec(); + if(now < next_run) { + worker_is_idle(); + debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now)); + while (now < next_run && host->health_enabled && !netdata_exit) { + sleep_usec(USEC_PER_SEC); + now = now_realtime_sec(); + } + } + else { + debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop); + } } -static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { +static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *silencers) { SILENCER *s; debug(D_HEALTH, "Checking if alarm was silenced via the command API. 
Alarm info name:%s context:%s chart:%s host:%s family:%s", - rc->name, (rc->rrdset)?rc->rrdset->context:"", rc->chart, host, (rc->rrdset)?rc->rrdset->family:""); + rrdcalc_name(rc), (rc->rrdset)?rrdset_context(rc->rrdset):"", rrdcalc_chart_name(rc), host, (rc->rrdset)?rrdset_family(rc->rrdset):""); for (s = silencers->silencers; s!=NULL; s=s->next){ if ( - (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern,rc->name))) && - (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern,rc->rrdset->context))) && + (!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern, rrdcalc_name(rc)))) && + (!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern, rrdset_context(rc->rrdset)))) && (!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) && - (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern,rc->chart))) && - (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern,rc->rrdset->family))) + (!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern, rrdcalc_chart_name(rc)))) && + (!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern, rrdset_family(rc->rrdset)))) ) { debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families); if (unlikely(silencers->stype == STYPE_NONE)) { - debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. 
The match has no effect.", rc->name); + debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc)); } else { debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s" , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced" - , rc->name - , (rc->rrdset)?rc->rrdset->context:"" - , rc->chart + , rrdcalc_name(rc) + , (rc->rrdset)?rrdset_context(rc->rrdset):"" + , rrdcalc_chart_name(rc) , host - , (rc->rrdset)?rc->rrdset->family:"" + , (rc->rrdset)?rrdset_family(rc->rrdset):"" ); } return silencers->stype; @@ -657,66 +890,86 @@ static SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers * @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise */ static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { - uint32_t rrdcalc_flags_old = rc->rrdcalc_flags; + uint32_t rrdcalc_flags_old = rc->run_flags; // Clear the flags - rc->rrdcalc_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED); + rc->run_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED); if (unlikely(silencers->all_alarms)) { - if (silencers->stype == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED; - else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED; + if (silencers->stype == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED; + else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED; } else { - SILENCE_TYPE st = check_silenced(rc, host->hostname, silencers); - if (st == STYPE_DISABLE_ALARMS) rc->rrdcalc_flags |= RRDCALC_FLAG_DISABLED; - else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->rrdcalc_flags |= RRDCALC_FLAG_SILENCED; + SILENCE_TYPE st = check_silenced(rc, rrdhost_hostname(host), silencers); + if (st == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED; + else if (st == 
STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED; } - if (rrdcalc_flags_old != rc->rrdcalc_flags) { + if (rrdcalc_flags_old != rc->run_flags) { info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s", - host->hostname, - rc->name, + rrdhost_hostname(host), + rrdcalc_name(rc), (rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false", - (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false", + (rc->run_flags & RRDCALC_FLAG_DISABLED)?"true":"false", (rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false", - (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false" + (rc->run_flags & RRDCALC_FLAG_SILENCED)?"true":"false" ); } - if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED) + if (rc->run_flags & RRDCALC_FLAG_DISABLED) return 1; else return 0; } -// Create alarms for dimensions that have been added to charts -// since the previous iteration. -static void init_pending_foreach_alarms(RRDHOST *host) { +static void health_execute_delayed_initializations(RRDHOST *host) { RRDSET *st; - RRDDIM *rd; - if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS)) - return; + if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION)) return; + rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_HEALTH_INITIALIZATION); - rrdhost_wrlock(host); + rrdset_foreach_reentrant(st, host) { + if(!rrdset_flag_check(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION)) continue; + rrdset_flag_clear(st, RRDSET_FLAG_PENDING_HEALTH_INITIALIZATION); - rrdset_foreach_write(st, host) { - if (!rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS)) - continue; + worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET); + + if(!st->rrdfamily) + st->rrdfamily = rrdfamily_add_and_acquire(host, rrdset_family(st)); + + if(!st->rrdvars) + st->rrdvars = rrdvariables_create(); + + rrddimvar_index_init(st); - rrdset_rdlock(st); + rrdsetvar_add_and_leave_released(st, "last_collected_t", RRDVAR_TYPE_TIME_T, &st->last_collected_time.tv_sec, 
RRDVAR_FLAG_NONE); + rrdsetvar_add_and_leave_released(st, "green", RRDVAR_TYPE_CALCULATED, &st->green, RRDVAR_FLAG_NONE); + rrdsetvar_add_and_leave_released(st, "red", RRDVAR_TYPE_CALCULATED, &st->red, RRDVAR_FLAG_NONE); + rrdsetvar_add_and_leave_released(st, "update_every", RRDVAR_TYPE_INT, &st->update_every, RRDVAR_FLAG_NONE); + rrdcalc_link_matching_alerts_to_rrdset(st); + rrdcalctemplate_link_matching_templates_to_rrdset(st); + + RRDDIM *rd; rrddim_foreach_read(rd, st) { - if (!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM)) - continue; + if(!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION)) continue; + rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_HEALTH_INITIALIZATION); - rrdcalc_link_to_rrddim(rd, st, host); + worker_is_busy(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM); - rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARM); - } + rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_CALCULATED, NULL, NULL, &rd->last_stored_value, RRDVAR_FLAG_NONE); + rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_COLLECTED, NULL, "_raw", &rd->last_collected_value, RRDVAR_FLAG_NONE); + rrddimvar_add_and_leave_released(rd, RRDVAR_TYPE_TIME_T, NULL, "_last_collected_t", &rd->last_collected_time.tv_sec, RRDVAR_FLAG_NONE); - rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS); - rrdset_unlock(st); - } + RRDCALCTEMPLATE *rt; + foreach_rrdcalctemplate_read(host, rt) { + if(!rt->foreach_dimension_pattern) + continue; - rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS); - rrdhost_unlock(host); + if(rrdcalctemplate_check_rrdset_conditions(rt, st, host)) + rrdcalctemplate_check_rrddim_conditions_and_link(rt, st, rd, host); + } + foreach_rrdcalctemplate_done(rt); + } + rrddim_foreach_done(rd); + } + rrdset_foreach_done(st); } /** @@ -729,19 +982,6 @@ static void init_pending_foreach_alarms(RRDHOST *host) { * @return It always returns NULL */ -#define WORKER_HEALTH_JOB_RRD_LOCK 0 -#define WORKER_HEALTH_JOB_HOST_LOCK 1 -#define 
WORKER_HEALTH_JOB_DB_QUERY 2 -#define WORKER_HEALTH_JOB_CALC_EVAL 3 -#define WORKER_HEALTH_JOB_WARNING_EVAL 4 -#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5 -#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6 -#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7 - -#if WORKER_UTILIZATION_MAX_JOB_TYPES < 8 -#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8 -#endif - void *health_main(void *ptr) { worker_register("HEALTH"); worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock"); @@ -752,8 +992,14 @@ void *health_main(void *ptr) { worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval"); worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry"); worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process"); + worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDSET, "rrdset init"); + worker_register_job_name(WORKER_HEALTH_JOB_DELAYED_INIT_RRDDIM, "rrddim init"); - netdata_thread_cleanup_push(health_main_cleanup, ptr); + struct health_state *h = ptr; + netdata_thread_cleanup_push(health_thread_cleanup, ptr); + + RRDHOST *host = h->host; + initialize_health(host, host == localhost); int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10); if(min_run_every < 1) min_run_every = 1; @@ -763,16 +1009,21 @@ void *health_main(void *ptr) { time_t now = now_realtime_sec(); time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60); - rrdcalc_labels_unlink(); + bool health_running_logged = false; + + rrdhost_rdlock(host); //CHECK + rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host); + rrdhost_unlock(host); unsigned int loop = 0; #ifdef ENABLE_ACLK unsigned int marked_aclk_reload_loop = 0; #endif - while(!netdata_exit) { + while(!netdata_exit && host->health_enabled) { loop++; debug(D_HEALTH, "Health monitoring iteration no %u started", loop); + now = now_realtime_sec(); int runnable = 0, 
apply_hibernation_delay = 0; time_t next_run = now + min_run_every; RRDCALC *rc; @@ -780,433 +1031,500 @@ void *health_main(void *ptr) { if (unlikely(check_if_resumed_from_suspension())) { apply_hibernation_delay = 1; - info( - "Postponing alarm checks for %"PRId64" seconds, " - "because it seems that the system was just resumed from suspension.", - (int64_t)hibernation_delay); + log_health( + "[%s]: Postponing alarm checks for %"PRId64" seconds, " + "because it seems that the system was just resumed from suspension.", + rrdhost_hostname(host), + (int64_t)hibernation_delay); } if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) { - static int logged=0; + static __thread int logged=0; if (!logged) { - info("Skipping health checks, because all alarms are disabled via a %s command.", - HEALTH_CMDAPI_CMD_DISABLEALL); + log_health("[%s]: Skipping health checks, because all alarms are disabled via a %s command.", + rrdhost_hostname(host), + HEALTH_CMDAPI_CMD_DISABLEALL); logged = 1; } } #ifdef ENABLE_ACLK - if (aclk_alert_reloaded && !marked_aclk_reload_loop) + if (host->aclk_alert_reloaded && !marked_aclk_reload_loop) marked_aclk_reload_loop = loop; #endif - worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK); - rrd_rdlock(); + if (unlikely(apply_hibernation_delay)) { + log_health( + "[%s]: Postponing health checks for %"PRId64" seconds.", + rrdhost_hostname(host), + (int64_t)hibernation_delay); - RRDHOST *host; - rrdhost_foreach_read(host) { - if (unlikely(!host->health_enabled)) + host->health_delay_up_to = now + hibernation_delay; + next_run = now + hibernation_delay; + health_sleep(next_run, loop, host); + } + + if (unlikely(host->health_delay_up_to)) { + if (unlikely(now < host->health_delay_up_to)) { + next_run = host->health_delay_up_to; + health_sleep(next_run, loop, host); continue; + } - if (unlikely(apply_hibernation_delay)) { - info( - "Postponing health checks for %"PRId64" seconds, on host '%s'.", - (int64_t)hibernation_delay, - 
host->hostname); + log_health("[%s]: Resuming health checks after delay.", rrdhost_hostname(host)); + host->health_delay_up_to = 0; + } - host->health_delay_up_to = now + hibernation_delay; + // wait until cleanup of obsolete charts on children is complete + if (host != localhost) { + if (unlikely(host->trigger_chart_obsoletion_check == 1)) { + log_health("[%s]: Waiting for chart obsoletion check.", rrdhost_hostname(host)); + health_sleep(next_run, loop, host); + continue; } + } - if (unlikely(host->health_delay_up_to)) { - if (unlikely(now < host->health_delay_up_to)) - continue; + if (!health_running_logged) { + log_health("[%s]: Health is running.", rrdhost_hostname(host)); + health_running_logged = true; + } - info("Resuming health checks on host '%s'.", host->hostname); - host->health_delay_up_to = 0; - } + if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0)) + sql_health_alarm_log_cleanup(host); - // wait until cleanup of obsolete charts on children is complete - if (host != localhost) - if (unlikely(host->trigger_chart_obsoletion_check == 1)) - continue; + health_execute_delayed_initializations(host); - if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0)) - sql_health_alarm_log_cleanup(host); + worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK); - init_pending_foreach_alarms(host); + // the first loop is to lookup values from the db + foreach_rrdcalc_in_rrdhost_read(host, rc) { - worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK); - rrdhost_rdlock(host); + rrdcalc_update_info_using_rrdset_labels(rc); - // the first loop is to lookup values from the db - for (rc = host->alarms; rc; rc = rc->next) { + if (update_disabled_silenced(host, rc)) + continue; - if (update_disabled_silenced(host, rc)) - continue; + // create an alert removed event if the chart is obsolete and + // has stopped being collected for 60 seconds + if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED && + 
rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) && + now > (rc->rrdset->last_collected_time.tv_sec + 60))) { + if (!rrdcalc_isrepeating(rc)) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); + time_t now = now_realtime_sec(); + + ALARM_ENTRY *ae = health_create_alarm_entry( + host, + rc->id, + rc->next_event_id++, + rc->config_hash_id, + now, + rc->name, + rc->rrdset->id, + rc->rrdset->context, + rc->rrdset->family, + rc->classification, + rc->component, + rc->type, + rc->exec, + rc->recipient, + now - rc->last_status_change, + rc->value, + NAN, + rc->status, + RRDCALC_STATUS_REMOVED, + rc->source, + rc->units, + rc->info, + 0, + rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0); + + if (ae) { + health_alarm_log_add_entry(host, ae); + rc->old_status = rc->status; + rc->status = RRDCALC_STATUS_REMOVED; + rc->last_status_change = now; + rc->last_updated = now; + rc->value = NAN; - // create an alert removed event if the chart is obsolete and - // has stopped being collected for 60 seconds - if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED && - rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) && - now > (rc->rrdset->last_collected_time.tv_sec + 60))) { - if (!rrdcalc_isrepeating(rc)) { - worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); - time_t now = now_realtime_sec(); - ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context, - rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, - rc->value, NAN, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0, 0); - if (ae) { - health_alarm_log(host, ae); - rc->old_status = rc->status; - rc->status = RRDCALC_STATUS_REMOVED; - rc->last_status_change = now; - rc->last_updated = now; - rc->value = NAN; #ifdef ENABLE_ACLK - if (netdata_cloud_setting && likely(!aclk_alert_reloaded)) - sql_queue_alarm_to_aclk(host, 
ae, 1); + if (netdata_cloud_setting && likely(!host->aclk_alert_reloaded)) + sql_queue_alarm_to_aclk(host, ae, 1); #endif - } } } + } - if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) { - if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUNNABLE; - continue; - } - - runnable++; - rc->old_value = rc->value; - rc->rrdcalc_flags |= RRDCALC_FLAG_RUNNABLE; - - // ------------------------------------------------------------ - // if there is database lookup, do it - - if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) { - worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY); - - /* time_t old_db_timestamp = rc->db_before; */ - int value_is_null = 0; + if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) { + if (unlikely(rc->run_flags & RRDCALC_FLAG_RUNNABLE)) + rc->run_flags &= ~RRDCALC_FLAG_RUNNABLE; + continue; + } - int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1, - rc->after, rc->before, rc->group, NULL, - 0, rc->options, - &rc->db_after,&rc->db_before, - NULL, NULL, NULL, - &value_is_null, NULL, 0, 0); + runnable++; + rc->old_value = rc->value; + rc->run_flags |= RRDCALC_FLAG_RUNNABLE; + + // ------------------------------------------------------------ + // if there is database lookup, do it + + if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) { + worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY); + + /* time_t old_db_timestamp = rc->db_before; */ + int value_is_null = 0; + + int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rrdcalc_dimensions(rc), 1, + rc->after, rc->before, rc->group, NULL, + 0, rc->options, + &rc->db_after,&rc->db_before, + NULL, NULL, NULL, + &value_is_null, NULL, 0, 0, + QUERY_SOURCE_HEALTH); + + if (unlikely(ret != 200)) { + // database lookup failed + rc->value = NAN; + rc->run_flags |= RRDCALC_FLAG_DB_ERROR; + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), ret + ); + } else 
+ rc->run_flags &= ~RRDCALC_FLAG_DB_ERROR; + + /* - RRDCALC_FLAG_DB_STALE not currently used + if (unlikely(old_db_timestamp == rc->db_before)) { + // database is stale + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); + + if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) { + rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE; + error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); + } + } + else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE)) + rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE; + */ + + if (unlikely(value_is_null)) { + // collected value is null + rc->value = NAN; + rc->run_flags |= RRDCALC_FLAG_DB_NAN; + + debug(D_HEALTH, + "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc) + ); + } else + rc->run_flags &= ~RRDCALC_FLAG_DB_NAN; + + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->value + ); + } - if (unlikely(ret != 200)) { - // database lookup failed - rc->value = NAN; - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_ERROR; + // ------------------------------------------------------------ + // if there is calculation expression, run it - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d", - host->hostname, rc->chart ? 
rc->chart : "NOCHART", rc->name, ret - ); - } else - rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_ERROR; + if (unlikely(rc->calculation)) { + worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL); - /* - RRDCALC_FLAG_DB_STALE not currently used - if (unlikely(old_db_timestamp == rc->db_before)) { - // database is stale + if (unlikely(!expression_evaluate(rc->calculation))) { + // calculation failed + rc->value = NAN; + rc->run_flags |= RRDCALC_FLAG_CALC_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), + rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg) + ); + } else { + rc->run_flags &= ~RRDCALC_FLAG_CALC_ERROR; - if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) { - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE; - error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name); - } - } - else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE)) - rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE; - */ + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " + NETDATA_DOUBLE_FORMAT + ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), + rc->calculation->parsed_as, rc->calculation->result, + buffer_tostring(rc->calculation->error_msg), rrdcalc_source(rc) + ); - if (unlikely(value_is_null)) { - // collected value is null - rc->value = NAN; - rc->rrdcalc_flags |= RRDCALC_FLAG_DB_NAN; + rc->value = rc->calculation->result; + } + } + } + foreach_rrdcalc_in_rrdhost_done(rc); - debug(D_HEALTH, - "Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)", - host->hostname, rc->chart ? 
rc->chart : "NOCHART", rc->name - ); - } else - rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN; + if (unlikely(runnable && !netdata_exit)) { + foreach_rrdcalc_in_rrdhost_read(host, rc) { + if (unlikely(!(rc->run_flags & RRDCALC_FLAG_RUNNABLE))) + continue; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, - rc->value - ); + if (rc->run_flags & RRDCALC_FLAG_DISABLED) { + continue; } + RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED; + RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED; - // ------------------------------------------------------------ - // if there is calculation expression, run it + // -------------------------------------------------------- + // check the warning expression - if (unlikely(rc->calculation)) { - worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL); + if (likely(rc->warning)) { + worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL); - if (unlikely(!expression_evaluate(rc->calculation))) { + if (unlikely(!expression_evaluate(rc->warning))) { // calculation failed - rc->value = NAN; - rc->rrdcalc_flags |= RRDCALC_FLAG_CALC_ERROR; + rc->run_flags |= RRDCALC_FLAG_WARN_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s", - host->hostname, rc->chart ? 
rc->chart : "NOCHART", rc->name, - rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg) - ); + debug(D_HEALTH, + "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), + buffer_tostring(rc->warning->error_msg) + ); } else { - rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR; - - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " + rc->run_flags &= ~RRDCALC_FLAG_WARN_ERROR; + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " NETDATA_DOUBLE_FORMAT - ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, - rc->calculation->parsed_as, rc->calculation->result, - buffer_tostring(rc->calculation->error_msg), rc->source - ); - - rc->value = rc->calculation->result; - - if (rc->local) rc->local->last_updated = now; - if (rc->family) rc->family->last_updated = now; - if (rc->hostid) rc->hostid->last_updated = now; - if (rc->hostname) rc->hostname->last_updated = now; + ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), + rrdcalc_name(rc), rc->warning->result, buffer_tostring(rc->warning->error_msg), rrdcalc_source(rc) + ); + warning_status = rrdcalc_value2status(rc->warning->result); } } - } - - rrdhost_unlock(host); - - if (unlikely(runnable && !netdata_exit)) { - rrdhost_rdlock(host); - for (rc = host->alarms; rc; rc = rc->next) { - if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUNNABLE))) - continue; + // -------------------------------------------------------- + // check the critical expression - if (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED) { - continue; - } - RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED; - RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED; - - // -------------------------------------------------------- - // check the warning expression - - if (likely(rc->warning)) { - 
worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL); - - if (unlikely(!expression_evaluate(rc->warning))) { - // calculation failed - rc->rrdcalc_flags |= RRDCALC_FLAG_WARN_ERROR; - - debug(D_HEALTH, - "Health on host '%s', alarm '%s.%s': warning expression failed with error: %s", - host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, - buffer_tostring(rc->warning->error_msg) - ); - } else { - rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " - NETDATA_DOUBLE_FORMAT - ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", - rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source - ); - warning_status = rrdcalc_value2status(rc->warning->result); - } - } + if (likely(rc->critical)) { + worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL); - // -------------------------------------------------------- - // check the critical expression - - if (likely(rc->critical)) { - worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL); - - if (unlikely(!expression_evaluate(rc->critical))) { - // calculation failed - rc->rrdcalc_flags |= RRDCALC_FLAG_CRIT_ERROR; - - debug(D_HEALTH, - "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", - host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, - buffer_tostring(rc->critical->error_msg) - ); - } else { - rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " - NETDATA_DOUBLE_FORMAT - ": %s (source: %s)", host->hostname, rc->chart ? 
rc->chart : "NOCHART", - rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg), - rc->source - ); - critical_status = rrdcalc_value2status(rc->critical->result); - } - } - - // -------------------------------------------------------- - // decide the final alarm status - - RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED; - - switch (warning_status) { - case RRDCALC_STATUS_CLEAR: - status = RRDCALC_STATUS_CLEAR; - break; - - case RRDCALC_STATUS_RAISED: - status = RRDCALC_STATUS_WARNING; - break; + if (unlikely(!expression_evaluate(rc->critical))) { + // calculation failed + rc->run_flags |= RRDCALC_FLAG_CRIT_ERROR; - default: - break; + debug(D_HEALTH, + "Health on host '%s', alarm '%s.%s': critical expression failed with error: %s", + rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), + buffer_tostring(rc->critical->error_msg) + ); + } else { + rc->run_flags &= ~RRDCALC_FLAG_CRIT_ERROR; + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " + NETDATA_DOUBLE_FORMAT + ": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), + rrdcalc_name(rc), rc->critical->result, buffer_tostring(rc->critical->error_msg), + rrdcalc_source(rc) + ); + critical_status = rrdcalc_value2status(rc->critical->result); } + } - switch (critical_status) { - case RRDCALC_STATUS_CLEAR: - if (status == RRDCALC_STATUS_UNDEFINED) - status = RRDCALC_STATUS_CLEAR; - break; + // -------------------------------------------------------- + // decide the final alarm status - case RRDCALC_STATUS_RAISED: - status = RRDCALC_STATUS_CRITICAL; - break; + RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED; - default: - break; - } + switch (warning_status) { + case RRDCALC_STATUS_CLEAR: + status = RRDCALC_STATUS_CLEAR; + break; - // -------------------------------------------------------- - // check if the new status and the old differ + case RRDCALC_STATUS_RAISED: + status = RRDCALC_STATUS_WARNING; + break; - if (status != rc->status) { 
- worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); - int delay = 0; - - // apply trigger hysteresis + default: + break; + } - if (now > rc->delay_up_to_timestamp) { - rc->delay_up_current = rc->delay_up_duration; - rc->delay_down_current = rc->delay_down_duration; - rc->delay_last = 0; - rc->delay_up_to_timestamp = 0; - } else { - rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier); - if (rc->delay_up_current > rc->delay_max_duration) - rc->delay_up_current = rc->delay_max_duration; + switch (critical_status) { + case RRDCALC_STATUS_CLEAR: + if (status == RRDCALC_STATUS_UNDEFINED) + status = RRDCALC_STATUS_CLEAR; + break; - rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier); - if (rc->delay_down_current > rc->delay_max_duration) - rc->delay_down_current = rc->delay_max_duration; - } + case RRDCALC_STATUS_RAISED: + status = RRDCALC_STATUS_CRITICAL; + break; - if (status > rc->status) - delay = rc->delay_up_current; - else - delay = rc->delay_down_current; + default: + break; + } - // COMMENTED: because we do need to send raising alarms - // if(now + delay < rc->delay_up_to_timestamp) - // delay = (int)(rc->delay_up_to_timestamp - now); + // -------------------------------------------------------- + // check if the new status and the old differ - rc->delay_last = delay; - rc->delay_up_to_timestamp = now + delay; + if (status != rc->status) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); + int delay = 0; + // apply trigger hysteresis - ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context, - rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, - rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, - rc->delay_last, - ( - ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? 
HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | - ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) - ) - ); - health_alarm_log(host, ae); + if (now > rc->delay_up_to_timestamp) { + rc->delay_up_current = rc->delay_up_duration; + rc->delay_down_current = rc->delay_down_duration; + rc->delay_last = 0; + rc->delay_up_to_timestamp = 0; + } else { + rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier); + if (rc->delay_up_current > rc->delay_max_duration) + rc->delay_up_current = rc->delay_max_duration; - rc->last_status_change = now; - rc->old_status = rc->status; - rc->status = status; + rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier); + if (rc->delay_down_current > rc->delay_max_duration) + rc->delay_down_current = rc->delay_max_duration; } - rc->last_updated = now; - rc->next_update = now + rc->update_every; - - if (next_run > rc->next_update) - next_run = rc->next_update; + if (status > rc->status) + delay = rc->delay_up_current; + else + delay = rc->delay_down_current; + + // COMMENTED: because we do need to send raising alarms + // if(now + delay < rc->delay_up_to_timestamp) + // delay = (int)(rc->delay_up_to_timestamp - now); + + rc->delay_last = delay; + rc->delay_up_to_timestamp = now + delay; + + ALARM_ENTRY *ae = health_create_alarm_entry( + host, + rc->id, + rc->next_event_id++, + rc->config_hash_id, + now, + rc->name, + rc->rrdset->id, + rc->rrdset->context, + rc->rrdset->family, + rc->classification, + rc->component, + rc->type, + rc->exec, + rc->recipient, + now - rc->last_status_change, + rc->old_value, + rc->value, + rc->status, + status, + rc->source, + rc->units, + rc->info, + rc->delay_last, + ( + ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->run_flags & RRDCALC_FLAG_SILENCED)? 
HEALTH_ENTRY_FLAG_SILENCED : 0) | + (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0) + ) + ); + + health_alarm_log_add_entry(host, ae); + + log_health("[%s]: Alert event for [%s.%s], value [%s], status [%s].", rrdhost_hostname(host), ae_chart_name(ae), ae_name(ae), ae_new_value_string(ae), rrdcalc_status2string(ae->new_status)); + + rc->last_status_change = now; + rc->old_status = rc->status; + rc->status = status; } - // process repeating alarms - RRDCALC *rc; - for(rc = host->alarms; rc ; rc = rc->next) { - int repeat_every = 0; - if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) { - if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) { - rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE; - repeat_every = rc->warn_repeat_every; - } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) { - rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE; - repeat_every = rc->crit_repeat_every; - } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) { - if(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE)) { - if(rc->old_status == RRDCALC_STATUS_CRITICAL) { - repeat_every = 1; - } else if (rc->old_status == RRDCALC_STATUS_WARNING) { - repeat_every = 1; - } + rc->last_updated = now; + rc->next_update = now + rc->update_every; + + if (next_run > rc->next_update) + next_run = rc->next_update; + } + foreach_rrdcalc_in_rrdhost_done(rc); + + // process repeating alarms + foreach_rrdcalc_in_rrdhost_read(host, rc) { + int repeat_every = 0; + if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) { + if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) { + rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE; + repeat_every = rc->warn_repeat_every; + } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) { + rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE; + repeat_every = rc->crit_repeat_every; + } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) { + if(!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE)) { + if(rc->old_status == RRDCALC_STATUS_CRITICAL) { + repeat_every = 
1; + } else if (rc->old_status == RRDCALC_STATUS_WARNING) { + repeat_every = 1; } } - } else { - continue; } + } else { + continue; + } - if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { - worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); - rc->last_repeat = now; - if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++; - ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context, - rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, - rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info, - rc->delay_last, - ( - ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | - ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) - ) - ); - ae->last_repeat = rc->last_repeat; - if (!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) { - ae->flags |= HEALTH_ENTRY_RUN_ONCE; - } - rc->rrdcalc_flags |= RRDCALC_FLAG_RUN_ONCE; - health_process_notifications(host, ae); - debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id); - health_alarm_wait_for_execution(ae); - health_alarm_log_free_one_nochecks_nounlink(ae); + if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); + rc->last_repeat = now; + if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++; + + ALARM_ENTRY *ae = health_create_alarm_entry( + host, + rc->id, + rc->next_event_id++, + rc->config_hash_id, + now, + rc->name, + rc->rrdset->id, + rc->rrdset->context, + rc->rrdset->family, + rc->classification, + rc->component, + rc->type, + rc->exec, + rc->recipient, + now - rc->last_status_change, + rc->old_value, + rc->value, + rc->old_status, + rc->status, + rc->source, + rc->units, + rc->info, + 
rc->delay_last, + ( + ((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) | + (rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0) + ) + ); + + ae->last_repeat = rc->last_repeat; + if (!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) { + ae->flags |= HEALTH_ENTRY_RUN_ONCE; } + rc->run_flags |= RRDCALC_FLAG_RUN_ONCE; + health_process_notifications(host, ae); + debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id); + health_alarm_wait_for_execution(ae); + health_alarm_log_free_one_nochecks_nounlink(ae); } - - rrdhost_unlock(host); } + foreach_rrdcalc_in_rrdhost_done(rc); + } - if (unlikely(netdata_exit)) - break; + if (unlikely(netdata_exit)) + break; - // execute notifications - // and cleanup - worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS); - health_alarm_log_process(host); + // execute notifications + // and cleanup + worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS); + health_alarm_log_process(host); - if (unlikely(netdata_exit)) { - // wait for all notifications to finish before allowing health to be cleaned up - ALARM_ENTRY *ae; - while (NULL != (ae = alarm_notifications_in_progress.head)) { - health_alarm_wait_for_execution(ae); - } - break; + if (unlikely(netdata_exit)) { + // wait for all notifications to finish before allowing health to be cleaned up + ALARM_ENTRY *ae; + while (NULL != (ae = alarm_notifications_in_progress.head)) { + health_alarm_wait_for_execution(ae); } - - } /* rrdhost_foreach */ + break; + } // wait for all notifications to finish before allowing health to be cleaned up ALARM_ENTRY *ae; @@ -1215,34 +1533,49 @@ void *health_main(void *ptr) { } #ifdef ENABLE_ACLK - if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) { - rrdhost_foreach_read(host) { - if (unlikely(!host->health_enabled)) - continue; 
- sql_queue_removed_alerts_to_aclk(host); - } - aclk_alert_reloaded = 0; - marked_aclk_reload_loop = 0; - } + if (netdata_cloud_setting && unlikely(host->aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) { + sql_queue_removed_alerts_to_aclk(host); + host->aclk_alert_reloaded = 0; + marked_aclk_reload_loop = 0; + } #endif - rrd_unlock(); - if(unlikely(netdata_exit)) break; - now = now_realtime_sec(); - if(now < next_run) { - worker_is_idle(); - debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now)); - sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now)); - now = now_realtime_sec(); - } - else - debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop); + health_sleep(next_run, loop, host); } // forever netdata_thread_cleanup_pop(1); return NULL; } + +void health_add_host_labels(void) { + DICTIONARY *labels = localhost->rrdlabels; + + int is_ephemeral = appconfig_get_boolean(&netdata_config, CONFIG_SECTION_HEALTH, "is ephemeral", CONFIG_BOOLEAN_NO); + rrdlabels_add(labels, "_is_ephemeral", is_ephemeral ? "true" : "false", RRDLABEL_SRC_CONFIG); + + int has_unstable_connection = appconfig_get_boolean(&netdata_config, CONFIG_SECTION_HEALTH, "has unstable connection", CONFIG_BOOLEAN_NO); + rrdlabels_add(labels, "_has_unstable_connection", has_unstable_connection ? 
"true" : "false", RRDLABEL_SRC_CONFIG); +} + +void health_thread_spawn(RRDHOST * host) { + if(!host->health_spawn) { + char tag[NETDATA_THREAD_TAG_MAX + 1]; + snprintfz(tag, NETDATA_THREAD_TAG_MAX, "HEALTH[%s]", rrdhost_hostname(host)); + struct health_state *health = callocz(1, sizeof(*health)); + health->host = host; + + if(netdata_thread_create(&host->health_thread, tag, NETDATA_THREAD_OPTION_JOINABLE, health_main, (void *) health)) { + log_health("[%s]: Failed to create new thread for client.", rrdhost_hostname(host)); + error("HEALTH [%s]: Failed to create new thread for client.", rrdhost_hostname(host)); + } + else { + log_health("[%s]: Created new thread for client.", rrdhost_hostname(host)); + host->health_spawn = 1; + host->aclk_alert_reloaded = 1; + } + } +} diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf index ec4937c0a..b9d6c2374 100644 --- a/health/health.d/dns_query.conf +++ b/health/health.d/dns_query.conf @@ -1,15 +1,14 @@ - # detect dns query failure - template: dns_query_time_query_time - on: dns_query_time.query_time - class: Latency + template: dns_query_query_status + on: dns_query.query_status + class: Errors type: DNS component: DNS - lookup: average -10s unaligned foreach * - units: ms + calc: $success + units: status every: 10s - warn: $this == nan - delay: up 20s down 5m multiplier 1.5 max 1h - info: average DNS query round trip time over the last 10 seconds + warn: $this != nan && $this != 1 + delay: up 30s down 5m multiplier 1.5 max 1h + info: DNS request type $label:record_type to server $label:server is unsuccessful to: sysadmin diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf index a84ab342f..cd87fe0e7 100644 --- a/health/health.d/go.d.plugin.conf +++ b/health/health.d/go.d.plugin.conf @@ -3,7 +3,7 @@ template: go.d_job_last_collected_secs on: netdata.go_plugin_execution_time - class: Error + class: Errors type: Netdata component: go.d.plugin module: !* * diff --git 
a/health/health.d/ml.conf b/health/health.d/ml.conf index 9bcc81e76..6836ce7b1 100644 --- a/health/health.d/ml.conf +++ b/health/health.d/ml.conf @@ -1,10 +1,26 @@ # below are some examples of using the `anomaly-bit` option to define alerts based on anomaly # rates as opposed to raw metric values. You can read more about the anomaly-bit and Netdata's # native anomaly detection here: -# https://learn.netdata.cloud/docs/configure/machine-learning#anomaly-bit---100--anomalous-0--normal +# https://learn.netdata.cloud/docs/agent/ml#anomaly-bit---100--anomalous-0--normal # examples below are commented, you would need to uncomment and adjust as desired to enable them. +# node level anomaly rate example +# https://learn.netdata.cloud/docs/agent/ml#node-anomaly-rate +# if node level anomaly rate is between 1-5% then warning (pick your own threshold that works best via trial and error). +# if node level anomaly rate is above 5% then critical (pick your own threshold that works best via trial and error). +# template: ml_1min_node_ar +# on: anomaly_detection.anomaly_rate +# os: linux +# hosts: * +# lookup: average -1m foreach anomaly_rate +# calc: $this +# units: % +# every: 30s +# warn: $this > (($status >= $WARNING) ? (1) : (5)) +# crit: $this > (($status == $CRITICAL) ? (5) : (100)) +# info: rolling 1min node level anomaly rate + # alert per dimension example # if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error). # if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error). @@ -33,4 +49,5 @@ # every: 30s # warn: $this > (($status >= $WARNING) ? (5) : (20)) # crit: $this > (($status == $CRITICAL) ? (20) : (100)) -# info: rolling 5min anomaly rate for system.cpu chart
\ No newline at end of file +# info: rolling 5min anomaly rate for system.cpu chart + diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf index 34452d983..3941c71cc 100644 --- a/health/health.d/mysql.conf +++ b/health/health.d/mysql.conf @@ -114,10 +114,10 @@ component: MySQL class: Utilization type: Database component: MySQL - lookup: max -2m absolute + lookup: max -2m at -1m unaligned units: nodes every: 10s - info: maximum galera cluster size in the last 2 minutes + info: maximum galera cluster size in the last 2 minutes starting one minute ago to: dba template: mysql_galera_cluster_size @@ -136,20 +136,29 @@ component: MySQL # galera node state - template: mysql_galera_cluster_state + template: mysql_galera_cluster_state_warn on: mysql.galera_cluster_state class: Errors type: Database component: MySQL - calc: $state + calc: $donor + $joined every: 10s - warn: $this == 2 OR $this == 3 - crit: $this == 0 OR $this == 1 OR $this >= 5 + warn: $this != nan AND $this != 0 delay: up 30s down 5m multiplier 1.5 max 1h - info: galera node state \ - (0: Undefined, 1: Joining, 2: Donor/Desynced, 3: Joined, 4: Synced, 5: Inconsistent) + info: galera node state is either Donor/Desynced or Joined. to: dba + template: mysql_galera_cluster_state_crit + on: mysql.galera_cluster_state + class: Errors + type: Database +component: MySQL + calc: $undefined + $joining + $error + every: 10s + crit: $this != nan AND $this != 0 + delay: up 30s down 5m multiplier 1.5 max 1h + info: galera node state is either Undefined or Joining or Error. 
+ to: dba # galera node status @@ -158,11 +167,10 @@ component: MySQL class: Errors type: Database component: MySQL - calc: $wsrep_cluster_status + calc: $primary every: 10s - crit: $mysql_galera_cluster_state != nan AND $this != 0 + crit: $this != nan AND $this != 1 delay: up 30s down 5m multiplier 1.5 max 1h - info: galera node cluster component status \ - (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected). \ - Any other value than primary indicates that the node is part of a nonoperational component. + info: galera node is part of a nonoperational component. \ + This occurs in cases of multiple membership changes that result in a loss of Quorum or in cases of split-brain situations. to: dba diff --git a/health/health.d/nvme.conf b/health/health.d/nvme.conf new file mode 100644 index 000000000..5f729d52b --- /dev/null +++ b/health/health.d/nvme.conf @@ -0,0 +1,15 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: nvme_device_critical_warnings_state + families: * + on: nvme.device_critical_warnings_state + class: Errors + type: System +component: Disk + lookup: max -30s unaligned + units: state + every: 10s + crit: $this != nan AND $this != 0 + delay: down 5m multiplier 1.5 max 2h + info: NVMe device $label:device has critical warnings + to: sysadmin diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf index 2e5c1cbfd..ee6c57cc5 100644 --- a/health/health.d/pihole.conf +++ b/health/health.d/pihole.conf @@ -15,21 +15,6 @@ component: Pi-hole info: gravity.list (blocklist) file last update time to: sysadmin -# Gravity file check (gravity.list). 
- - template: pihole_blocklist_gravity_file - on: pihole.blocklist_last_update - class: Errors - type: Ad Filtering -component: Pi-hole - every: 10s - units: boolean - calc: $file_exists - crit: $this != 1 - delay: up 2m down 5m - info: gravity.list (blocklist) file existence state (0: not-exists, 1: exists) - to: sysadmin - # Pi-hole's ability to block unwanted domains. # Should be enabled. The whole point of Pi-hole! @@ -39,9 +24,9 @@ component: Pi-hole type: Ad Filtering component: Pi-hole every: 10s - units: boolean - calc: $enabled - warn: $this != 1 + units: status + calc: $disabled + warn: $this != nan AND $this == 1 delay: up 2m down 5m - info: unwanted domains blocking status (0: disabled, 1: enabled) + info: unwanted domains blocking is disabled to: sysadmin diff --git a/health/health.d/ping.conf b/health/health.d/ping.conf new file mode 100644 index 000000000..cbe7c30c9 --- /dev/null +++ b/health/health.d/ping.conf @@ -0,0 +1,50 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: ping_host_reachable + families: * + on: ping.host_packet_loss + class: Errors + type: Other +component: Network + lookup: average -30s unaligned of loss + calc: $this != nan AND $this < 100 + units: up/down + every: 10s + crit: $this == 0 + delay: down 30m multiplier 1.5 max 2h + info: network host $label:host reachability status + to: sysadmin + + template: ping_packet_loss + families: * + on: ping.host_packet_loss + class: Errors + type: Other +component: Network + lookup: average -10m unaligned of loss + green: 5 + red: 10 + units: % + every: 10s + warn: $this > $green + crit: $this > $red + delay: down 30m multiplier 1.5 max 2h + info: packet loss percentage to the network host $label:host over the last 10 minutes + to: sysadmin + + template: ping_host_latency + families: * + on: ping.host_rtt + class: Latency + type: Other +component: Network + lookup: average -10s unaligned of avg + units: ms + every: 10s + green: 500 + red: 1000 + 
warn: $this > $green OR $max > $red + crit: $this > $red + delay: down 30m multiplier 1.5 max 2h + info: average latency to the network host $label:host over the last 10 seconds + to: sysadmin diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf new file mode 100644 index 000000000..66d034cfe --- /dev/null +++ b/health/health.d/postgres.conf @@ -0,0 +1,214 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: postgres_total_connection_utilization + on: postgres.connections_utilization + class: Utilization + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of used + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (80) : (90)) + delay: down 15m multiplier 1.5 max 1h + info: average total connection utilization over the last minute + to: dba + + template: postgres_acquired_locks_utilization + on: postgres.locks_utilization + class: Utilization + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of used + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (15) : (20)) + delay: down 15m multiplier 1.5 max 1h + info: average acquired locks utilization over the last minute + to: dba + + template: postgres_txid_exhaustion_perc + on: postgres.txid_exhaustion_perc + class: Utilization + type: Database +component: PostgreSQL + hosts: * + calc: $txid_exhaustion + units: % + every: 1m + warn: $this > 90 + delay: down 15m multiplier 1.5 max 1h + info: percent towards TXID wraparound + to: dba + +# Database alarms + + template: postgres_db_cache_io_ratio + on: postgres.db_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? 
(60) : (50)) + delay: down 15m multiplier 1.5 max 1h + info: average cache hit ratio in db $label:database over the last minute + to: dba + + template: postgres_db_transactions_rollback_ratio + on: postgres.db_transactions_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -5m unaligned of rollback + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (2)) + delay: down 15m multiplier 1.5 max 1h + info: average aborted transactions percentage in db $label:database over the last five minutes + to: dba + + template: postgres_db_deadlocks_rate + on: postgres.db_deadlocks_rate + class: Errors + type: Database +component: PostgreSQL + hosts: * + lookup: sum -1m unaligned of deadlocks + units: deadlocks + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (10)) + delay: down 15m multiplier 1.5 max 1h + info: number of deadlocks detected in db $label:database in the last minute + to: dba + +# Table alarms + + template: postgres_table_cache_io_ratio + on: postgres.table_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + info: average cache hit ratio in db $label:database table $label:table over the last minute + to: dba + + template: postgres_table_index_cache_io_ratio + on: postgres.table_index_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? 
(60) : (50)) + delay: down 15m multiplier 1.5 max 1h + info: average index cache hit ratio in db $label:database table $label:table over the last minute + to: dba + + template: postgres_table_toast_cache_io_ratio + on: postgres.table_toast_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + info: average TOAST hit ratio in db $label:database table $label:table over the last minute + to: dba + + template: postgres_table_toast_index_cache_io_ratio + on: postgres.table_toast_index_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + info: average index TOAST hit ratio in db $label:database table $label:table over the last minute + to: dba + + template: postgres_table_bloat_size_perc + on: postgres.table_bloat_size_perc + class: Errors + type: Database +component: PostgreSQL + hosts: * + calc: $bloat + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (60) : (70)) + crit: $this > (($status == $CRITICAL) ? 
(70) : (80)) + delay: down 15m multiplier 1.5 max 1h + info: bloat size percentage in db $label:database table $label:table + to: dba + + template: postgres_table_last_autovacuum_time + on: postgres.table_autovacuum_since_time + class: Errors + type: Database +component: PostgreSQL + hosts: !* + calc: $time + units: seconds + every: 1m + warn: $this != nan AND $this > (60 * 60 * 24 * 7) + info: time elapsed since db $label:database table $label:table was vacuumed by the autovacuum daemon + to: dba + + template: postgres_table_last_autoanalyze_time + on: postgres.table_autoanalyze_since_time + class: Errors + type: Database +component: PostgreSQL + hosts: !* + calc: $time + units: seconds + every: 1m + warn: $this != nan AND $this > (60 * 60 * 24 * 7) + info: time elapsed since db $label:database table $label:table was analyzed by the autovacuum daemon + to: dba + +# Index alarms + + template: postgres_index_bloat_size_perc + on: postgres.index_bloat_size_perc + class: Errors + type: Database +component: PostgreSQL + hosts: * + calc: $bloat + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (60) : (70)) + crit: $this > (($status == $CRITICAL) ? 
(70) : (80)) + delay: down 15m multiplier 1.5 max 1h + info: bloat size percentage in db $label:database table $label:table index $label:index + to: dba diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf index e3b3d11cf..0e81a482f 100644 --- a/health/health.d/python.d.plugin.conf +++ b/health/health.d/python.d.plugin.conf @@ -3,7 +3,7 @@ template: python.d_job_last_collected_secs on: netdata.pythond_runtime - class: Error + class: Errors type: Netdata component: python.d.plugin module: !* * diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf index cad5230c5..34d00b5df 100644 --- a/health/health.d/redis.conf +++ b/health/health.d/redis.conf @@ -1,3 +1,18 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: redis_connections_rejected + families: * + on: redis.connections + class: Errors + type: KV Storage +component: Redis + lookup: sum -1m unaligned of rejected + every: 10s + units: connections + warn: $this > 0 + info: connections rejected because of maxclients limit in the last minute + delay: down 5m multiplier 1.5 max 1h + to: dba template: redis_bgsave_broken families: * @@ -26,3 +41,17 @@ component: Redis info: duration of the on-going RDB save operation delay: down 5m multiplier 1.5 max 1h to: dba + + template: redis_master_link_down + families: * + on: redis.master_link_down_since_time + class: Errors + type: KV Storage +component: Redis + every: 10s + calc: $time + units: seconds + crit: $this != nan AND $this > 0 + info: time elapsed since the link between master and slave is down + delay: down 5m multiplier 1.5 max 1h + to: dba diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf index 38213a8db..531d62fac 100644 --- a/health/health.d/systemdunits.conf +++ b/health/health.d/systemdunits.conf @@ -1,142 +1,141 @@ -## Check if the are any systemd units in the failed state (crashed). 
-## States: 1 - active, 2 - inactive, 3 - activating, 4 - deactivating, 5 - failed. +# you can disable an alarm notification by setting the 'to' line to: silent ## Service units - template: systemd_service_units_state - on: systemd.service_units_state + template: systemd_service_unit_failed_state + on: systemd.service_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more systemd service units are in the failed state + info: systemd service unit in the failed state to: sysadmin ## Socket units - template: systemd_socket_units_state + template: systemd_socket_unit_failed_state on: systemd.socket_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more systemd socket units are in the failed state + info: systemd socket unit in the failed state to: sysadmin ## Target units - template: systemd_target_units_state + template: systemd_target_unit_failed_state on: systemd.target_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more systemd target units are in the failed state + info: systemd target unit in the failed state to: sysadmin ## Path units - template: systemd_path_units_state + template: systemd_path_unit_failed_state on: systemd.path_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND 
$this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more systemd path units are in the failed state + info: systemd path unit in the failed state to: sysadmin ## Device units - template: systemd_device_units_state + template: systemd_device_unit_failed_state on: systemd.device_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more the systemd device units are in the failed state + info: systemd device unit in the failed state to: sysadmin ## Mount units - template: systemd_mount_units_state + template: systemd_mount_unit_failed_state on: systemd.mount_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more the systemd mount units are in the failed state + info: systemd mount units in the failed state to: sysadmin ## Automount units - template: systemd_automount_units_state + template: systemd_automount_unit_failed_state on: systemd.automount_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more systemd automount units are in the failed state + info: systemd automount unit in the failed state to: sysadmin ## Swap units - template: systemd_swap_units_state + template: systemd_swap_unit_failed_state on: systemd.swap_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - 
warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more systemd swap units are in the failed state + info: systemd swap units in the failed state to: sysadmin ## Scope units - template: systemd_scope_units_state + template: systemd_scope_unit_failed_state on: systemd.scope_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more systemd scope units are in the failed state + info: systemd scope units in the failed state to: sysadmin ## Slice units - template: systemd_slice_units_state + template: systemd_slice_unit_failed_state on: systemd.slice_unit_state class: Errors type: Linux component: Systemd units - lookup: max -1s min2max - units: ok/failed + calc: $failed + units: state every: 10s - warn: $this != nan AND $this == 5 + warn: $this != nan AND $this == 1 delay: down 5m multiplier 1.5 max 1h - info: one or more systemd slice units are in the failed state + info: systemd slice units in the failed state to: sysadmin diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf index 35cb6366c..ff116db64 100644 --- a/health/health.d/tcp_resets.conf +++ b/health/health.d/tcp_resets.conf @@ -26,7 +26,7 @@ component: Network lookup: average -10s unaligned absolute of OutRsts units: tcp resets/s every: 10s - warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (10))) + warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (10))) delay: up 20s down 60m multiplier 1.2 max 2h options: no-clear-notification info: average number of sent TCP RESETS over the last 10 seconds. 
\ @@ -60,7 +60,7 @@ component: Network lookup: average -10s unaligned absolute of AttemptFails units: tcp resets/s every: 10s - warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10))) + warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10))) delay: up 20s down 60m multiplier 1.2 max 2h options: no-clear-notification info: average number of received TCP RESETS over the last 10 seconds. \ diff --git a/health/health.d/timex.conf b/health/health.d/timex.conf index 23c18ba10..2e9b1a3cf 100644 --- a/health/health.d/timex.conf +++ b/health/health.d/timex.conf @@ -5,7 +5,7 @@ alarm: system_clock_sync_state on: system.clock_sync_state os: linux - class: Error + class: Errors type: System component: Clock calc: $state diff --git a/health/health.h b/health/health.h index 3e77c12a7..15d8326ee 100644 --- a/health/health.h +++ b/health/health.h @@ -14,6 +14,7 @@ extern unsigned int default_health_enabled; #define HEALTH_ENTRY_FLAG_SILENCED 0x00000010 #define HEALTH_ENTRY_RUN_ONCE 0x00000020 #define HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS 0x00000040 +#define HEALTH_ENTRY_FLAG_IS_REPEATING 0x00000080 #define HEALTH_ENTRY_FLAG_SAVED 0x10000000 #define HEALTH_ENTRY_FLAG_ACLK_QUEUED 0x20000000 @@ -31,65 +32,72 @@ extern unsigned int default_health_enabled; extern char *silencers_filename; -extern void health_init(void); +void health_init(void); -extern void health_reload(void); +void health_reload(void); -extern int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, NETDATA_DOUBLE *result); -extern void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status); -extern void health_alarms2json(RRDHOST *host, BUFFER *wb, int all); -extern void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all); -extern void health_alarm_log2json(RRDHOST *host, 
BUFFER *wb, uint32_t after, char *chart); +void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status); +void health_alarms2json(RRDHOST *host, BUFFER *wb, int all); +void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all); +void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart); void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf); void health_api_v1_chart_custom_variables2json(RRDSET *st, BUFFER *buf); -extern int health_alarm_log_open(RRDHOST *host); -extern void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae); -extern void health_alarm_log_load(RRDHOST *host); - -extern ALARM_ENTRY* health_create_alarm_entry( - RRDHOST *host, - uint32_t alarm_id, - uint32_t alarm_event_id, - uuid_t config_hash_id, - time_t when, - const char *name, - const char *chart, - const char *chart_context, - const char *family, - const char *classification, - const char *component, - const char *type, - const char *exec, - const char *recipient, - time_t duration, - NETDATA_DOUBLE old_value, - NETDATA_DOUBLE new_value, - RRDCALC_STATUS old_status, - RRDCALC_STATUS new_status, - const char *source, - const char *units, - const char *info, - int delay, - uint32_t flags); - -extern void health_alarm_log(RRDHOST *host, ALARM_ENTRY *ae); - -extern void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath); -extern char *health_user_config_dir(void); -extern char *health_stock_config_dir(void); -extern void health_alarm_log_free(RRDHOST *host); - -extern void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae); - -extern void *health_cmdapi_thread(void *ptr); - -extern void health_label_log_save(RRDHOST *host); - -extern char *health_edit_command_from_source(const char *source); -extern void sql_refresh_hashes(void); - -extern SIMPLE_PATTERN *health_pattern_from_foreach(char *s); +int health_alarm_log_open(RRDHOST *host); +void 
health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae); +void health_alarm_log_load(RRDHOST *host); + +void health_thread_spawn(RRDHOST *host); +void health_thread_stop(RRDHOST *host); + +ALARM_ENTRY* health_create_alarm_entry( + RRDHOST *host, + uint32_t alarm_id, + uint32_t alarm_event_id, + const uuid_t config_hash_id, + time_t when, + STRING *name, + STRING *chart, + STRING *chart_context, + STRING *family, + STRING *classification, + STRING *component, + STRING *type, + STRING *exec, + STRING *recipient, + time_t duration, + NETDATA_DOUBLE old_value, + NETDATA_DOUBLE new_value, + RRDCALC_STATUS old_status, + RRDCALC_STATUS new_status, + STRING *source, + STRING *units, + STRING *info, + int delay, + uint32_t flags); + +void health_alarm_log_add_entry(RRDHOST *host, ALARM_ENTRY *ae); + +struct health_state { + RRDHOST *host; + netdata_thread_t thread; +}; + +void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath); +char *health_user_config_dir(void); +char *health_stock_config_dir(void); +void health_alarm_log_free(RRDHOST *host); + +void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae); + +void *health_cmdapi_thread(void *ptr); + +void health_label_log_save(RRDHOST *host); + +char *health_edit_command_from_source(const char *source); +void sql_refresh_hashes(void); + +void health_add_host_labels(void); #endif //NETDATA_HEALTH_H diff --git a/health/health_config.c b/health/health_config.c index e1dd32ab1..f9decfad5 100644 --- a/health/health_config.c +++ b/health/health_config.c @@ -33,148 +33,6 @@ #define HEALTH_HOST_LABEL_KEY "host labels" #define HEALTH_FOREACH_KEY "foreach" -static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { - if(!rc->chart) { - error("Health configuration for alarm '%s' does not have a chart", rc->name); - return 0; - } - - if(!rc->update_every) { - error("Health configuration for alarm '%s.%s' has no frequency (parameter 'every'). 
Ignoring it.", rc->chart?rc->chart:"NOCHART", rc->name); - return 0; - } - - if(!RRDCALC_HAS_DB_LOOKUP(rc) && !rc->calculation && !rc->warning && !rc->critical) { - error("Health configuration for alarm '%s.%s' is useless (no db lookup, no calculation, no warning and no critical expressions)", rc->chart?rc->chart:"NOCHART", rc->name); - return 0; - } - - if (rrdcalc_exists(host, rc->chart, rc->name, rc->hash_chart, rc->hash)) - return 0; - - rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id); - - debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " NETDATA_DOUBLE_FORMAT_AUTO - ", red " NETDATA_DOUBLE_FORMAT_AUTO - ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", - rc->chart?rc->chart:"NOCHART", - rc->name, - rc->id, - (rc->exec)?rc->exec:"DEFAULT", - (rc->recipient)?rc->recipient:"DEFAULT", - rc->green, - rc->red, - (int)rc->group, - rc->after, - rc->before, - rc->options, - (rc->dimensions)?rc->dimensions:"NONE", - (rc->foreachdim)?rc->foreachdim:"NONE", - rc->update_every, - (rc->calculation)?rc->calculation->parsed_as:"NONE", - (rc->warning)?rc->warning->parsed_as:"NONE", - (rc->critical)?rc->critical->parsed_as:"NONE", - rc->source, - rc->delay_up_duration, - rc->delay_down_duration, - rc->delay_max_duration, - rc->delay_multiplier, - rc->warn_repeat_every, - rc->crit_repeat_every - ); - - rrdcalc_add_to_host(host, rc); - - return 1; -} - -static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCALCTEMPLATE *rt) { - if(unlikely(!rt->context)) { - error("Health configuration for template '%s' does not have a context", rt->name); - return 0; - } - - if(unlikely(!rt->update_every)) { - error("Health configuration for template '%s' has no 
frequency (parameter 'every'). Ignoring it.", rt->name); - return 0; - } - - if(unlikely(!RRDCALCTEMPLATE_HAS_DB_LOOKUP(rt) && !rt->calculation && !rt->warning && !rt->critical)) { - error("Health configuration for template '%s' is useless (no calculation, no warning and no critical evaluation)", rt->name); - return 0; - } - - RRDCALCTEMPLATE *t, *last = NULL; - if(!rt->foreachdim) { - for (t = host->templates; t ; last = t, t = t->next) { - if(unlikely(t->hash_name == rt->hash_name - && !strcmp(t->name, rt->name) - && !strcmp(t->family_match?t->family_match:"*", rt->family_match?rt->family_match:"*") - )) { - info("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname); - return 0; - } - } - - if(likely(last)) { - last->next = rt; - } - else { - rt->next = host->templates; - host->templates = rt; - } - } else { - for (t = host->alarms_template_with_foreach; t ; last = t, t = t->next) { - if(unlikely(t->hash_name == rt->hash_name - && !strcmp(t->name, rt->name) - && !strcmp(t->family_match?t->family_match:"*", rt->family_match?rt->family_match:"*") - )) { - info("Health configuration template '%s' already exists for host '%s'.", rt->name, host->hostname); - return 0; - } - } - - if(likely(last)) { - last->next = rt; - } - else { - rt->next = host->alarms_template_with_foreach; - host->alarms_template_with_foreach = rt; - } - } - - debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " NETDATA_DOUBLE_FORMAT_AUTO - ", red " NETDATA_DOUBLE_FORMAT_AUTO - ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", - rt->name, - (rt->context)?rt->context:"NONE", - (rt->exec)?rt->exec:"DEFAULT", - (rt->recipient)?rt->recipient:"DEFAULT", - rt->green, - rt->red, - 
(int)rt->group, - rt->after, - rt->before, - rt->options, - (rt->dimensions)?rt->dimensions:"NONE", - (rt->foreachdim)?rt->foreachdim:"NONE", - rt->update_every, - (rt->calculation)?rt->calculation->parsed_as:"NONE", - (rt->warning)?rt->warning->parsed_as:"NONE", - (rt->critical)?rt->critical->parsed_as:"NONE", - rt->source, - rt->delay_up_duration, - rt->delay_down_duration, - rt->delay_max_duration, - rt->delay_multiplier, - rt->warn_repeat_every, - rt->crit_repeat_every - ); - - - return 1; -} - static inline int health_parse_delay( size_t line, const char *filename, char *string, int *delay_up_duration, @@ -275,7 +133,7 @@ static inline uint32_t health_parse_options(const char *s) { buf[count] = '\0'; if(!strcasecmp(buf, "no-clear-notification") || !strcasecmp(buf, "no-clear")) - options |= RRDCALC_FLAG_NO_CLEAR_NOTIFICATION; + options |= RRDCALC_OPTION_NO_CLEAR_NOTIFICATION; else error("Ignoring unknown alarm option '%s'", buf); } @@ -334,13 +192,21 @@ static inline int health_parse_repeat( * * @param s the string that will be used to create the simple pattern. 
*/ -SIMPLE_PATTERN *health_pattern_from_foreach(char *s) { + +static void dimension_remove_pipe_comma(char *str) { + while(*str) { + if(*str == '|' || *str == ',') *str = ' '; + str++; + } +} + +static SIMPLE_PATTERN *health_pattern_from_foreach(const char *s) { char *convert= strdupz(s); SIMPLE_PATTERN *val = NULL; + if(convert) { dimension_remove_pipe_comma(convert); val = simple_pattern_create(convert, NULL, SIMPLE_PATTERN_EXACT); - freez(convert); } @@ -350,18 +216,18 @@ SIMPLE_PATTERN *health_pattern_from_foreach(char *s) { static inline int health_parse_db_lookup( size_t line, const char *filename, char *string, RRDR_GROUPING *group_method, int *after, int *before, int *every, - uint32_t *options, char **dimensions, char **foreachdim + RRDCALC_OPTIONS *options, STRING **dimensions, STRING **foreachdim ) { debug(D_HEALTH, "Health configuration parsing database lookup %zu@%s: %s", line, filename, string); - if(*dimensions) freez(*dimensions); - if(*foreachdim) freez(*foreachdim); + if(*dimensions) string_freez(*dimensions); + if(*foreachdim) string_freez(*foreachdim); *dimensions = NULL; *foreachdim = NULL; *after = 0; *before = 0; *every = 0; - *options = 0; + *options = (*options) & RRDCALC_ALL_OPTIONS_EXCLUDING_THE_RRDR_ONES; // preserve rrdcalc options char *s = string, *key; @@ -453,7 +319,7 @@ static inline int health_parse_db_lookup( if(find) { *find = '\0'; } - *dimensions = strdupz(s); + *dimensions = string_strdupz(s); } if(!find) { @@ -462,7 +328,7 @@ static inline int health_parse_db_lookup( s = ++find; } else if(!strcasecmp(key, HEALTH_FOREACH_KEY )) { - *foreachdim = strdupz(s); + *foreachdim = string_strdupz(s); break; } else { @@ -474,10 +340,10 @@ static inline int health_parse_db_lookup( return 1; } -static inline char *health_source_file(size_t line, const char *file) { +static inline STRING *health_source_file(size_t line, const char *file) { char buffer[FILENAME_MAX + 1]; snprintfz(buffer, FILENAME_MAX, "%zu@%s", line, file); - return 
strdupz(buffer); + return string_strdupz(buffer); } char *health_edit_command_from_source(const char *source) @@ -496,7 +362,7 @@ char *health_edit_command_from_source(const char *source) netdata_configured_user_config_dir, file_no_path + 1, temp, - localhost->registry_hostname); + rrdhost_registry_hostname(localhost)); } else buffer[0] = '\0'; @@ -513,35 +379,35 @@ static inline void strip_quotes(char *s) { static inline void alert_config_free(struct alert_config *cfg) { - freez(cfg->alarm); - freez(cfg->template_key); - freez(cfg->os); - freez(cfg->host); - freez(cfg->on); - freez(cfg->families); - freez(cfg->plugin); - freez(cfg->module); - freez(cfg->charts); - freez(cfg->lookup); - freez(cfg->calc); - freez(cfg->warn); - freez(cfg->crit); - freez(cfg->every); - freez(cfg->green); - freez(cfg->red); - freez(cfg->exec); - freez(cfg->to); - freez(cfg->units); - freez(cfg->info); - freez(cfg->classification); - freez(cfg->component); - freez(cfg->type); - freez(cfg->delay); - freez(cfg->options); - freez(cfg->repeat); - freez(cfg->host_labels); - freez(cfg->p_db_lookup_dimensions); - freez(cfg->p_db_lookup_method); + string_freez(cfg->alarm); + string_freez(cfg->template_key); + string_freez(cfg->os); + string_freez(cfg->host); + string_freez(cfg->on); + string_freez(cfg->families); + string_freez(cfg->plugin); + string_freez(cfg->module); + string_freez(cfg->charts); + string_freez(cfg->lookup); + string_freez(cfg->calc); + string_freez(cfg->warn); + string_freez(cfg->crit); + string_freez(cfg->every); + string_freez(cfg->green); + string_freez(cfg->red); + string_freez(cfg->exec); + string_freez(cfg->to); + string_freez(cfg->units); + string_freez(cfg->info); + string_freez(cfg->classification); + string_freez(cfg->component); + string_freez(cfg->type); + string_freez(cfg->delay); + string_freez(cfg->options); + string_freez(cfg->repeat); + string_freez(cfg->host_labels); + string_freez(cfg->p_db_lookup_dimensions); + string_freez(cfg->p_db_lookup_method); 
freez(cfg); } @@ -670,23 +536,35 @@ static int health_readfile(const char *filename, void *data) { if(hash == hash_alarm && !strcasecmp(key, HEALTH_ALARM_KEY)) { if(rc) { - if(!alert_hash_and_store_config(rc->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this || !rrdcalc_add_alarm_from_config(host, rc)) { - rrdcalc_free(rc); - } + if(!alert_hash_and_store_config(rc->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this) + rrdcalc_free_unused_rrdcalc_loaded_from_config(rc); + else + rrdcalc_add_from_config(host, rc); + // health_add_alarms_loop(host, rc, ignore_this) ; } if(rt) { - if (!alert_hash_and_store_config(rt->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this || !rrdcalctemplate_add_template_from_config(host, rt)) { - rrdcalctemplate_free(rt); - } + if(!alert_hash_and_store_config(rt->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this) + rrdcalctemplate_free_unused_rrdcalctemplate_loaded_from_config(rt); + else + rrdcalctemplate_add_from_config(host, rt); + rt = NULL; } rc = callocz(1, sizeof(RRDCALC)); rc->next_event_id = 1; - rc->name = strdupz(value); - rc->hash = simple_hash(rc->name); + + { + char *tmp = strdupz(value); + if(rrdvar_fix_name(tmp)) + error("Health configuration renamed alarm '%s' to '%s'", value, tmp); + + rc->name = string_strdupz(tmp); + freez(tmp); + } + rc->source = health_source_file(line, filename); rc->green = NAN; rc->red = NAN; @@ -700,58 +578,62 @@ static int health_readfile(const char *filename, void *data) { alert_config_free(alert_cfg); alert_cfg = callocz(1, sizeof(struct alert_config)); - if(rrdvar_fix_name(rc->name)) - error("Health configuration renamed alarm '%s' to '%s'", value, rc->name); - - alert_cfg->alarm = strdupz(rc->name); + alert_cfg->alarm = string_dup(rc->name); ignore_this = 0; } else if(hash == hash_template && !strcasecmp(key, HEALTH_TEMPLATE_KEY)) { if(rc) { // health_add_alarms_loop(host, rc, ignore_this) ; - if(!alert_hash_and_store_config(rc->config_hash_id, alert_cfg, 
sql_store_hashes) || ignore_this || !rrdcalc_add_alarm_from_config(host, rc)) { - rrdcalc_free(rc); - } + if(!alert_hash_and_store_config(rc->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this) + rrdcalc_free_unused_rrdcalc_loaded_from_config(rc); + else + rrdcalc_add_from_config(host, rc); rc = NULL; } if(rt) { - if(!alert_hash_and_store_config(rt->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this || !rrdcalctemplate_add_template_from_config(host, rt)) { - rrdcalctemplate_free(rt); - } + if(!alert_hash_and_store_config(rt->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this) + rrdcalctemplate_free_unused_rrdcalctemplate_loaded_from_config(rt); + else + rrdcalctemplate_add_from_config(host, rt); } rt = callocz(1, sizeof(RRDCALCTEMPLATE)); - rt->name = strdupz(value); - rt->hash_name = simple_hash(rt->name); + + { + char *tmp = strdupz(value); + if(rrdvar_fix_name(tmp)) + error("Health configuration renamed template '%s' to '%s'", value, tmp); + + rt->name = string_strdupz(tmp); + freez(tmp); + } + rt->source = health_source_file(line, filename); rt->green = NAN; rt->red = NAN; - rt->delay_multiplier = 1.0; + rt->delay_multiplier = (float)1.0; rt->warn_repeat_every = host->health_default_warn_repeat_every; rt->crit_repeat_every = host->health_default_crit_repeat_every; if (alert_cfg) alert_config_free(alert_cfg); alert_cfg = callocz(1, sizeof(struct alert_config)); - if(rrdvar_fix_name(rt->name)) - error("Health configuration renamed template '%s' to '%s'", value, rt->name); - - alert_cfg->template_key = strdupz(rt->name); + alert_cfg->template_key = string_dup(rt->name); ignore_this = 0; } else if(hash == hash_os && !strcasecmp(key, HEALTH_OS_KEY)) { char *os_match = value; - if (alert_cfg) alert_cfg->os = strdupz(value); + if (alert_cfg) alert_cfg->os = string_strdupz(value); SIMPLE_PATTERN *os_pattern = simple_pattern_create(os_match, NULL, SIMPLE_PATTERN_EXACT); - if(!simple_pattern_matches(os_pattern, host->os)) { + 
if(!simple_pattern_matches(os_pattern, rrdhost_os(host))) { if(rc) - debug(D_HEALTH, "HEALTH on '%s' ignoring alarm '%s' defined at %zu@%s: host O/S does not match '%s'", host->hostname, rc->name, line, filename, os_match); + debug(D_HEALTH, "HEALTH on '%s' ignoring alarm '%s' defined at %zu@%s: host O/S does not match '%s'", rrdhost_hostname(host), rrdcalc_name(rc), line, filename, os_match); if(rt) - debug(D_HEALTH, "HEALTH on '%s' ignoring template '%s' defined at %zu@%s: host O/S does not match '%s'", host->hostname, rt->name, line, filename, os_match); + debug(D_HEALTH, "HEALTH on '%s' ignoring template '%s' defined at %zu@%s: host O/S does not match '%s'", rrdhost_hostname(host), rrdcalctemplate_name(rt), line, filename, os_match); ignore_this = 1; } @@ -760,15 +642,15 @@ static int health_readfile(const char *filename, void *data) { } else if(hash == hash_host && !strcasecmp(key, HEALTH_HOST_KEY)) { char *host_match = value; - if (alert_cfg) alert_cfg->host = strdupz(value); + if (alert_cfg) alert_cfg->host = string_strdupz(value); SIMPLE_PATTERN *host_pattern = simple_pattern_create(host_match, NULL, SIMPLE_PATTERN_EXACT); - if(!simple_pattern_matches(host_pattern, host->hostname)) { + if(!simple_pattern_matches(host_pattern, rrdhost_hostname(host))) { if(rc) - debug(D_HEALTH, "HEALTH on '%s' ignoring alarm '%s' defined at %zu@%s: hostname does not match '%s'", host->hostname, rc->name, line, filename, host_match); + debug(D_HEALTH, "HEALTH on '%s' ignoring alarm '%s' defined at %zu@%s: hostname does not match '%s'", rrdhost_hostname(host), rrdcalc_name(rc), line, filename, host_match); if(rt) - debug(D_HEALTH, "HEALTH on '%s' ignoring template '%s' defined at %zu@%s: hostname does not match '%s'", host->hostname, rt->name, line, filename, host_match); + debug(D_HEALTH, "HEALTH on '%s' ignoring template '%s' defined at %zu@%s: hostname does not match '%s'", rrdhost_hostname(host), rrdcalctemplate_name(rt), line, filename, host_match); ignore_this = 1; } @@ 
-777,65 +659,68 @@ static int health_readfile(const char *filename, void *data) { } else if(rc) { if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) { - alert_cfg->on = strdupz(value); + alert_cfg->on = string_strdupz(value); if(rc->chart) { - if(strcmp(rc->chart, value) != 0) + if(strcmp(rrdcalc_chart_name(rc), value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rc->name, key, rc->chart, value, value); + line, filename, rrdcalc_name(rc), key, rrdcalc_chart_name(rc), value, value); - freez(rc->chart); + string_freez(rc->chart); } - rc->chart = strdupz(value); - rc->hash_chart = simple_hash(rc->chart); + rc->chart = string_strdupz(value); } else if(hash == hash_class && !strcasecmp(key, HEALTH_CLASS_KEY)) { - alert_cfg->classification = strdupz(value); + strip_quotes(value); + + alert_cfg->classification = string_strdupz(value); if(rc->classification) { - if(strcmp(rc->classification, value) != 0) + if(strcmp(rrdcalc_classification(rc), value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. 
Using ('%s').", - line, filename, rc->name, key, rc->classification, value, value); + line, filename, rrdcalc_name(rc), key, rrdcalc_classification(rc), value, value); - freez(rc->classification); + string_freez(rc->classification); } - rc->classification = strdupz(value); - strip_quotes(rc->classification); + rc->classification = string_strdupz(value); } else if(hash == hash_component && !strcasecmp(key, HEALTH_COMPONENT_KEY)) { - alert_cfg->component = strdupz(value); + strip_quotes(value); + + alert_cfg->component = string_strdupz(value); if(rc->component) { - if(strcmp(rc->component, value) != 0) + if(strcmp(rrdcalc_component(rc), value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rc->name, key, rc->component, value, value); + line, filename, rrdcalc_name(rc), key, rrdcalc_component(rc), value, value); - freez(rc->component); + string_freez(rc->component); } - rc->component = strdupz(value); - strip_quotes(rc->component); + rc->component = string_strdupz(value); } else if(hash == hash_type && !strcasecmp(key, HEALTH_TYPE_KEY)) { - alert_cfg->type = strdupz(value); + strip_quotes(value); + + alert_cfg->type = string_strdupz(value); if(rc->type) { - if(strcmp(rc->type, value) != 0) + if(strcmp(rrdcalc_type(rc), value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. 
Using ('%s').", - line, filename, rc->name, key, rc->type, value, value); + line, filename, rrdcalc_name(rc), key, rrdcalc_type(rc), value, value); - freez(rc->type); + string_freez(rc->type); } - rc->type = strdupz(value); - strip_quotes(rc->type); + rc->type = string_strdupz(value); } else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) { - alert_cfg->lookup = strdupz(value); + alert_cfg->lookup = string_strdupz(value); health_parse_db_lookup(line, filename, value, &rc->group, &rc->after, &rc->before, - &rc->update_every, &rc->options, &rc->dimensions, &rc->foreachdim); - if(rc->foreachdim) { - rc->spdim = health_pattern_from_foreach(rc->foreachdim); - } + &rc->update_every, &rc->options, &rc->dimensions, &rc->foreach_dimension); + + if(rc->foreach_dimension) + rc->foreach_dimension_pattern = health_pattern_from_foreach(rrdcalc_foreachdim(rc)); + if (rc->after) { if (rc->dimensions) - alert_cfg->p_db_lookup_dimensions = strdupz(rc->dimensions); + alert_cfg->p_db_lookup_dimensions = string_dup(rc->dimensions); if (rc->group) - alert_cfg->p_db_lookup_method = strdupz(group_method2string(rc->group)); + alert_cfg->p_db_lookup_method = string_strdupz(group_method2string(rc->group)); alert_cfg->p_db_lookup_options = rc->options; alert_cfg->p_db_lookup_after = rc->after; alert_cfg->p_db_lookup_before = rc->before; @@ -843,248 +728,261 @@ static int health_readfile(const char *filename, void *data) { } } else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { - alert_cfg->every = strdupz(value); + alert_cfg->every = string_strdupz(value); if(!config_parse_duration(value, &rc->update_every)) error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.", - line, filename, rc->name, key, value); + line, filename, rrdcalc_name(rc), key, value); alert_cfg->p_update_every = rc->update_every; } else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) { - alert_cfg->green = strdupz(value); + 
alert_cfg->green = string_strdupz(value); char *e; rc->green = str2ndd(value, &e); if(e && *e) { error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.", - line, filename, rc->name, key, e); + line, filename, rrdcalc_name(rc), key, e); } } else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) { - alert_cfg->red = strdupz(value); + alert_cfg->red = string_strdupz(value); char *e; rc->red = str2ndd(value, &e); if(e && *e) { error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.", - line, filename, rc->name, key, e); + line, filename, rrdcalc_name(rc), key, e); } } else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) { - alert_cfg->calc = strdupz(value); + alert_cfg->calc = string_strdupz(value); const char *failed_at = NULL; int error = 0; rc->calculation = expression_parse(value, &failed_at, &error); if(!rc->calculation) { error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, filename, rc->name, key, value, expression_strerror(error), failed_at); + line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at); } } else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) { - alert_cfg->warn = strdupz(value); + alert_cfg->warn = string_strdupz(value); const char *failed_at = NULL; int error = 0; rc->warning = expression_parse(value, &failed_at, &error); if(!rc->warning) { error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, filename, rc->name, key, value, expression_strerror(error), failed_at); + line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at); } } else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) { - alert_cfg->crit = strdupz(value); + alert_cfg->crit = string_strdupz(value); const char 
*failed_at = NULL; int error = 0; rc->critical = expression_parse(value, &failed_at, &error); if(!rc->critical) { error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, filename, rc->name, key, value, expression_strerror(error), failed_at); + line, filename, rrdcalc_name(rc), key, value, expression_strerror(error), failed_at); } } else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) { - alert_cfg->exec = strdupz(value); + alert_cfg->exec = string_strdupz(value); if(rc->exec) { - if(strcmp(rc->exec, value) != 0) + if(strcmp(rrdcalc_exec(rc), value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rc->name, key, rc->exec, value, value); + line, filename, rrdcalc_name(rc), key, rrdcalc_exec(rc), value, value); - freez(rc->exec); + string_freez(rc->exec); } - rc->exec = strdupz(value); + rc->exec = string_strdupz(value); } else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) { - alert_cfg->to = strdupz(value); + alert_cfg->to = string_strdupz(value); if(rc->recipient) { - if(strcmp(rc->recipient, value) != 0) + if(strcmp(rrdcalc_recipient(rc), value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. 
Using ('%s').", - line, filename, rc->name, key, rc->recipient, value, value); + line, filename, rrdcalc_name(rc), key, rrdcalc_recipient(rc), value, value); - freez(rc->recipient); + string_freez(rc->recipient); } - rc->recipient = strdupz(value); + rc->recipient = string_strdupz(value); } else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) { - alert_cfg->units = strdupz(value); + strip_quotes(value); + + alert_cfg->units = string_strdupz(value); if(rc->units) { - if(strcmp(rc->units, value) != 0) + if(strcmp(rrdcalc_units(rc), value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rc->name, key, rc->units, value, value); + line, filename, rrdcalc_name(rc), key, rrdcalc_units(rc), value, value); - freez(rc->units); + string_freez(rc->units); } - rc->units = strdupz(value); - strip_quotes(rc->units); + rc->units = string_strdupz(value); } else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) { - alert_cfg->info = strdupz(value); + strip_quotes(value); + + alert_cfg->info = string_strdupz(value); if(rc->info) { - if(strcmp(rc->info, value) != 0) + if(strcmp(rrdcalc_info(rc), value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. 
Using ('%s').", - line, filename, rc->name, key, rc->info, value, value); + line, filename, rrdcalc_name(rc), key, rrdcalc_info(rc), value, value); - freez(rc->info); + string_freez(rc->info); + string_freez(rc->original_info); } - rc->info = strdupz(value); - strip_quotes(rc->info); + rc->info = string_strdupz(value); + rc->original_info = string_dup(rc->info); } else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) { - alert_cfg->delay = strdupz(value); + alert_cfg->delay = string_strdupz(value); health_parse_delay(line, filename, value, &rc->delay_up_duration, &rc->delay_down_duration, &rc->delay_max_duration, &rc->delay_multiplier); } else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) { - alert_cfg->options = strdupz(value); + alert_cfg->options = string_strdupz(value); rc->options |= health_parse_options(value); } else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){ - alert_cfg->repeat = strdupz(value); + alert_cfg->repeat = string_strdupz(value); health_parse_repeat(line, filename, value, &rc->warn_repeat_every, &rc->crit_repeat_every); } else if(hash == hash_host_label && !strcasecmp(key, HEALTH_HOST_LABEL_KEY)) { - alert_cfg->host_labels = strdupz(value); + alert_cfg->host_labels = string_strdupz(value); if(rc->host_labels) { - if(strcmp(rc->host_labels, value) != 0) + if(strcmp(rrdcalc_host_labels(rc), value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'.", - line, filename, rc->name, key, value, value); + line, filename, rrdcalc_name(rc), key, value, value); - freez(rc->host_labels); + string_freez(rc->host_labels); simple_pattern_free(rc->host_labels_pattern); } - rc->host_labels = simple_pattern_trim_around_equal(value); - rc->host_labels_pattern = simple_pattern_create(rc->host_labels, NULL, SIMPLE_PATTERN_EXACT); + { + char *tmp = simple_pattern_trim_around_equal(value); + rc->host_labels = string_strdupz(tmp); 
+ freez(tmp); + } + rc->host_labels_pattern = simple_pattern_create(rrdcalc_host_labels(rc), NULL, SIMPLE_PATTERN_EXACT); } else if(hash == hash_plugin && !strcasecmp(key, HEALTH_PLUGIN_KEY)) { - alert_cfg->plugin = strdupz(value); - freez(rc->plugin_match); + alert_cfg->plugin = string_strdupz(value); + string_freez(rc->plugin_match); simple_pattern_free(rc->plugin_pattern); - rc->plugin_match = strdupz(value); - rc->plugin_pattern = simple_pattern_create(rc->plugin_match, NULL, SIMPLE_PATTERN_EXACT); + rc->plugin_match = string_strdupz(value); + rc->plugin_pattern = simple_pattern_create(rrdcalc_plugin_match(rc), NULL, SIMPLE_PATTERN_EXACT); } else if(hash == hash_module && !strcasecmp(key, HEALTH_MODULE_KEY)) { - alert_cfg->module = strdupz(value); - freez(rc->module_match); + alert_cfg->module = string_strdupz(value); + string_freez(rc->module_match); simple_pattern_free(rc->module_pattern); - rc->module_match = strdupz(value); - rc->module_pattern = simple_pattern_create(rc->module_match, NULL, SIMPLE_PATTERN_EXACT); + rc->module_match = string_strdupz(value); + rc->module_pattern = simple_pattern_create(rrdcalc_module_match(rc), NULL, SIMPLE_PATTERN_EXACT); } else { error("Health configuration at line %zu of file '%s' for alarm '%s' has unknown key '%s'.", - line, filename, rc->name, key); + line, filename, rrdcalc_name(rc), key); } } else if(rt) { if(hash == hash_on && !strcasecmp(key, HEALTH_ON_KEY)) { - alert_cfg->on = strdupz(value); + alert_cfg->on = string_strdupz(value); if(rt->context) { - if(strcmp(rt->context, value) != 0) + if(strcmp(string2str(rt->context), value) != 0) error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. 
Using ('%s').", - line, filename, rt->name, key, rt->context, value, value); + line, filename, rrdcalctemplate_name(rt), key, string2str(rt->context), value, value); - freez(rt->context); + string_freez(rt->context); } - rt->context = strdupz(value); - rt->hash_context = simple_hash(rt->context); + rt->context = string_strdupz(value); } else if(hash == hash_class && !strcasecmp(key, HEALTH_CLASS_KEY)) { - alert_cfg->classification = strdupz(value); + strip_quotes(value); + + alert_cfg->classification = string_strdupz(value); if(rt->classification) { - if(strcmp(rt->classification, value) != 0) + if(strcmp(rrdcalctemplate_classification(rt), value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rt->name, key, rt->classification, value, value); + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_classification(rt), value, value); - freez(rt->classification); + string_freez(rt->classification); } - rt->classification = strdupz(value); - strip_quotes(rt->classification); + rt->classification = string_strdupz(value); } else if(hash == hash_component && !strcasecmp(key, HEALTH_COMPONENT_KEY)) { - alert_cfg->component = strdupz(value); + strip_quotes(value); + + alert_cfg->component = string_strdupz(value); if(rt->component) { - if(strcmp(rt->component, value) != 0) + if(strcmp(rrdcalctemplate_component(rt), value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. 
Using ('%s').", - line, filename, rt->name, key, rt->component, value, value); + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_component(rt), value, value); - freez(rt->component); + string_freez(rt->component); } - rt->component = strdupz(value); - strip_quotes(rt->component); + rt->component = string_strdupz(value); } else if(hash == hash_type && !strcasecmp(key, HEALTH_TYPE_KEY)) { - alert_cfg->type = strdupz(value); + strip_quotes(value); + + alert_cfg->type = string_strdupz(value); if(rt->type) { - if(strcmp(rt->type, value) != 0) + if(strcmp(rrdcalctemplate_type(rt), value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rt->name, key, rt->type, value, value); + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_type(rt), value, value); - freez(rt->type); + string_freez(rt->type); } - rt->type = strdupz(value); - strip_quotes(rt->type); + rt->type = string_strdupz(value); } else if(hash == hash_families && !strcasecmp(key, HEALTH_FAMILIES_KEY)) { - alert_cfg->families = strdupz(value); - freez(rt->family_match); + alert_cfg->families = string_strdupz(value); + string_freez(rt->family_match); simple_pattern_free(rt->family_pattern); - rt->family_match = strdupz(value); - rt->family_pattern = simple_pattern_create(rt->family_match, NULL, SIMPLE_PATTERN_EXACT); + rt->family_match = string_strdupz(value); + rt->family_pattern = simple_pattern_create(rrdcalctemplate_family_match(rt), NULL, SIMPLE_PATTERN_EXACT); } else if(hash == hash_plugin && !strcasecmp(key, HEALTH_PLUGIN_KEY)) { - alert_cfg->plugin = strdupz(value); - freez(rt->plugin_match); + alert_cfg->plugin = string_strdupz(value); + string_freez(rt->plugin_match); simple_pattern_free(rt->plugin_pattern); - rt->plugin_match = strdupz(value); - rt->plugin_pattern = simple_pattern_create(rt->plugin_match, NULL, SIMPLE_PATTERN_EXACT); + rt->plugin_match 
= string_strdupz(value); + rt->plugin_pattern = simple_pattern_create(rrdcalctemplate_plugin_match(rt), NULL, SIMPLE_PATTERN_EXACT); } else if(hash == hash_module && !strcasecmp(key, HEALTH_MODULE_KEY)) { - alert_cfg->module = strdupz(value); - freez(rt->module_match); + alert_cfg->module = string_strdupz(value); + string_freez(rt->module_match); simple_pattern_free(rt->module_pattern); - rt->module_match = strdupz(value); - rt->module_pattern = simple_pattern_create(rt->module_match, NULL, SIMPLE_PATTERN_EXACT); + rt->module_match = string_strdupz(value); + rt->module_pattern = simple_pattern_create(rrdcalctemplate_module_match(rt), NULL, SIMPLE_PATTERN_EXACT); } else if(hash == hash_charts && !strcasecmp(key, HEALTH_CHARTS_KEY)) { - alert_cfg->charts = strdupz(value); - freez(rt->charts_match); + alert_cfg->charts = string_strdupz(value); + string_freez(rt->charts_match); simple_pattern_free(rt->charts_pattern); - rt->charts_match = strdupz(value); - rt->charts_pattern = simple_pattern_create(rt->charts_match, NULL, SIMPLE_PATTERN_EXACT); + rt->charts_match = string_strdupz(value); + rt->charts_pattern = simple_pattern_create(rrdcalctemplate_charts_match(rt), NULL, SIMPLE_PATTERN_EXACT); } else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) { - alert_cfg->lookup = strdupz(value); + alert_cfg->lookup = string_strdupz(value); health_parse_db_lookup(line, filename, value, &rt->group, &rt->after, &rt->before, - &rt->update_every, &rt->options, &rt->dimensions, &rt->foreachdim); - if(rt->foreachdim) { - rt->spdim = health_pattern_from_foreach(rt->foreachdim); - } + &rt->update_every, &rt->options, &rt->dimensions, &rt->foreach_dimension); + + if(rt->foreach_dimension) + rt->foreach_dimension_pattern = health_pattern_from_foreach(rrdcalctemplate_foreachdim(rt)); + if (rt->after) { if (rt->dimensions) - alert_cfg->p_db_lookup_dimensions = strdupz(rt->dimensions); + alert_cfg->p_db_lookup_dimensions = string_dup(rt->dimensions); + if (rt->group) - 
alert_cfg->p_db_lookup_method = strdupz(group_method2string(rt->group)); + alert_cfg->p_db_lookup_method = string_strdupz(group_method2string(rt->group)); + alert_cfg->p_db_lookup_options = rt->options; alert_cfg->p_db_lookup_after = rt->after; alert_cfg->p_db_lookup_before = rt->before; @@ -1092,137 +990,143 @@ static int health_readfile(const char *filename, void *data) { } } else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { - alert_cfg->every = strdupz(value); + alert_cfg->every = string_strdupz(value); if(!config_parse_duration(value, &rt->update_every)) error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' cannot parse duration: '%s'.", - line, filename, rt->name, key, value); + line, filename, rrdcalctemplate_name(rt), key, value); alert_cfg->p_update_every = rt->update_every; } else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) { - alert_cfg->green = strdupz(value); + alert_cfg->green = string_strdupz(value); char *e; rt->green = str2ndd(value, &e); if(e && *e) { error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.", - line, filename, rt->name, key, e); + line, filename, rrdcalctemplate_name(rt), key, e); } } else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) { - alert_cfg->red = strdupz(value); + alert_cfg->red = string_strdupz(value); char *e; rt->red = str2ndd(value, &e); if(e && *e) { error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.", - line, filename, rt->name, key, e); + line, filename, rrdcalctemplate_name(rt), key, e); } } else if(hash == hash_calc && !strcasecmp(key, HEALTH_CALC_KEY)) { - alert_cfg->calc = strdupz(value); + alert_cfg->calc = string_strdupz(value); const char *failed_at = NULL; int error = 0; rt->calculation = expression_parse(value, &failed_at, &error); if(!rt->calculation) { error("Health configuration at line %zu of file '%s' 
for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, filename, rt->name, key, value, expression_strerror(error), failed_at); + line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at); } } else if(hash == hash_warn && !strcasecmp(key, HEALTH_WARN_KEY)) { - alert_cfg->warn = strdupz(value); + alert_cfg->warn = string_strdupz(value); const char *failed_at = NULL; int error = 0; rt->warning = expression_parse(value, &failed_at, &error); if(!rt->warning) { error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, filename, rt->name, key, value, expression_strerror(error), failed_at); + line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at); } } else if(hash == hash_crit && !strcasecmp(key, HEALTH_CRIT_KEY)) { - alert_cfg->crit = strdupz(value); + alert_cfg->crit = string_strdupz(value); const char *failed_at = NULL; int error = 0; rt->critical = expression_parse(value, &failed_at, &error); if(!rt->critical) { error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' has unparse-able expression '%s': %s at '%s'", - line, filename, rt->name, key, value, expression_strerror(error), failed_at); + line, filename, rrdcalctemplate_name(rt), key, value, expression_strerror(error), failed_at); } } else if(hash == hash_exec && !strcasecmp(key, HEALTH_EXEC_KEY)) { - alert_cfg->exec = strdupz(value); + alert_cfg->exec = string_strdupz(value); if(rt->exec) { - if(strcmp(rt->exec, value) != 0) + if(strcmp(rrdcalctemplate_exec(rt), value) != 0) error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. 
Using ('%s').", - line, filename, rt->name, key, rt->exec, value, value); + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_exec(rt), value, value); - freez(rt->exec); + string_freez(rt->exec); } - rt->exec = strdupz(value); + rt->exec = string_strdupz(value); } else if(hash == hash_recipient && !strcasecmp(key, HEALTH_RECIPIENT_KEY)) { - alert_cfg->to = strdupz(value); + alert_cfg->to = string_strdupz(value); if(rt->recipient) { - if(strcmp(rt->recipient, value) != 0) + if(strcmp(rrdcalctemplate_recipient(rt), value) != 0) error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rt->name, key, rt->recipient, value, value); + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_recipient(rt), value, value); - freez(rt->recipient); + string_freez(rt->recipient); } - rt->recipient = strdupz(value); + rt->recipient = string_strdupz(value); } else if(hash == hash_units && !strcasecmp(key, HEALTH_UNITS_KEY)) { - alert_cfg->units = strdupz(value); + strip_quotes(value); + + alert_cfg->units = string_strdupz(value); if(rt->units) { - if(strcmp(rt->units, value) != 0) + if(strcmp(rrdcalctemplate_units(rt), value) != 0) error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. 
Using ('%s').", - line, filename, rt->name, key, rt->units, value, value); + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_units(rt), value, value); - freez(rt->units); + string_freez(rt->units); } - rt->units = strdupz(value); - strip_quotes(rt->units); + rt->units = string_strdupz(value); } else if(hash == hash_info && !strcasecmp(key, HEALTH_INFO_KEY)) { - alert_cfg->info = strdupz(value); + strip_quotes(value); + + alert_cfg->info = string_strdupz(value); if(rt->info) { - if(strcmp(rt->info, value) != 0) + if(strcmp(rrdcalctemplate_info(rt), value) != 0) error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rt->name, key, rt->info, value, value); + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_info(rt), value, value); - freez(rt->info); + string_freez(rt->info); } - rt->info = strdupz(value); - strip_quotes(rt->info); + rt->info = string_strdupz(value); } else if(hash == hash_delay && !strcasecmp(key, HEALTH_DELAY_KEY)) { - alert_cfg->delay = strdupz(value); + alert_cfg->delay = string_strdupz(value); health_parse_delay(line, filename, value, &rt->delay_up_duration, &rt->delay_down_duration, &rt->delay_max_duration, &rt->delay_multiplier); } else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) { - alert_cfg->options = strdupz(value); + alert_cfg->options = string_strdupz(value); rt->options |= health_parse_options(value); } else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){ - alert_cfg->repeat = strdupz(value); + alert_cfg->repeat = string_strdupz(value); health_parse_repeat(line, filename, value, &rt->warn_repeat_every, &rt->crit_repeat_every); } else if(hash == hash_host_label && !strcasecmp(key, HEALTH_HOST_LABEL_KEY)) { - alert_cfg->host_labels = strdupz(value); + alert_cfg->host_labels = string_strdupz(value); if(rt->host_labels) { - if(strcmp(rt->host_labels, value) != 
0) + if(strcmp(rrdcalctemplate_host_labels(rt), value) != 0) error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').", - line, filename, rt->name, key, rt->host_labels, value, value); + line, filename, rrdcalctemplate_name(rt), key, rrdcalctemplate_host_labels(rt), value, value); - freez(rt->host_labels); + string_freez(rt->host_labels); simple_pattern_free(rt->host_labels_pattern); } - rt->host_labels = simple_pattern_trim_around_equal(value); - rt->host_labels_pattern = simple_pattern_create(rt->host_labels, NULL, SIMPLE_PATTERN_EXACT); + { + char *tmp = simple_pattern_trim_around_equal(value); + rt->host_labels = string_strdupz(tmp); + freez(tmp); + } + rt->host_labels_pattern = simple_pattern_create(rrdcalctemplate_host_labels(rt), NULL, SIMPLE_PATTERN_EXACT); } else { error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.", - line, filename, rt->name, key); + line, filename, rrdcalctemplate_name(rt), key); } } else { @@ -1233,15 +1137,17 @@ static int health_readfile(const char *filename, void *data) { if(rc) { //health_add_alarms_loop(host, rc, ignore_this) ; - if(!alert_hash_and_store_config(rc->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this || !rrdcalc_add_alarm_from_config(host, rc)) { - rrdcalc_free(rc); - } + if(!alert_hash_and_store_config(rc->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this) + rrdcalc_free_unused_rrdcalc_loaded_from_config(rc); + else + rrdcalc_add_from_config(host, rc); } if(rt) { - if(!alert_hash_and_store_config(rt->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this || !rrdcalctemplate_add_template_from_config(host, rt)) { - rrdcalctemplate_free(rt); - } + if(!alert_hash_and_store_config(rt->config_hash_id, alert_cfg, sql_store_hashes) || ignore_this) + rrdcalctemplate_free_unused_rrdcalctemplate_loaded_from_config(rt); + else + rrdcalctemplate_add_from_config(host, 
rt); } if (alert_cfg) @@ -1257,8 +1163,8 @@ void sql_refresh_hashes(void) } void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath) { - if(unlikely(!host->health_enabled)) { - debug(D_HEALTH, "CONFIG health is not enabled for host '%s'", host->hostname); + if(unlikely(!host->health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) { + debug(D_HEALTH, "CONFIG health is not enabled for host '%s'", rrdhost_hostname(host)); return; } @@ -1266,10 +1172,11 @@ void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path CONFIG_BOOLEAN_YES); if (!stock_enabled) { - info("Netdata will not load stock alarms."); + log_health("[%s]: Netdata will not load stock alarms.", rrdhost_hostname(host)); stock_path = user_path; } recursive_config_double_dir_load(user_path, stock_path, subpath, health_readfile, (void *) host, 0); + log_health("[%s]: Read health configuration.", rrdhost_hostname(host)); sql_store_hashes = 0; } diff --git a/health/health_json.c b/health/health_json.c index 4e8f43761..2dd59fd46 100644 --- a/health/health_json.c +++ b/health/health_json.c @@ -14,7 +14,7 @@ void health_string2json(BUFFER *wb, const char *prefix, const char *label, const } void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) { - char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN"); + char *edit_command = ae->source ? 
health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN"); char config_hash_id[GUID_LEN + 1]; uuid_unparse_lower(ae->config_hash_id, config_hash_id); @@ -57,30 +57,30 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) "\t\t\"old_value_string\": \"%s\",\n" "\t\t\"last_repeat\": \"%lu\",\n" "\t\t\"silenced\": \"%s\",\n" - , host->hostname + , rrdhost_hostname(host) , host->utc_offset - , host->abbrev_timezone + , rrdhost_abbrev_timezone(host) , ae->unique_id , ae->alarm_id , ae->alarm_event_id , config_hash_id - , ae->name - , ae->chart - , ae->chart_context - , ae->family - , ae->classification?ae->classification:"Unknown" - , ae->component?ae->component:"Unknown" - , ae->type?ae->type:"Unknown" + , ae_name(ae) + , ae_chart_name(ae) + , ae_chart_context(ae) + , ae_family(ae) + , ae->classification?ae_classification(ae):"Unknown" + , ae->component?ae_component(ae):"Unknown" + , ae->type?ae_type(ae):"Unknown" , (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false" , (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false" , (unsigned long)ae->exec_run_timestamp , (ae->flags & HEALTH_ENTRY_FLAG_EXEC_FAILED)?"true":"false" - , ae->exec?ae->exec:host->health_default_exec - , ae->recipient?ae->recipient:host->health_default_recipient + , ae->exec?ae_exec(ae):string2str(host->health_default_exec) + , ae->recipient?ae_recipient(ae):string2str(host->health_default_recipient) , ae->exec_code - , ae->source + , ae_source(ae) , edit_command - , ae->units?ae->units:"" + , ae_units(ae) , (unsigned long)ae->when , (unsigned long)ae->duration , (unsigned long)ae->non_clear_duration @@ -90,28 +90,13 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) , (unsigned long)ae->delay_up_to_timestamp , ae->updated_by_id , ae->updates_id - , ae->new_value_string - , ae->old_value_string + , ae_new_value_string(ae) + , ae_old_value_string(ae) , (unsigned long)ae->last_repeat , (ae->flags & 
HEALTH_ENTRY_FLAG_SILENCED)?"true":"false" ); - char *replaced_info = NULL; - if (likely(ae->info)) { - char *m = NULL; - replaced_info = strdupz(ae->info); - size_t pos = 0; - while ((m = strstr(replaced_info + pos, "$family"))) { - char *buf = NULL; - pos = m - replaced_info; - buf = find_and_replace(replaced_info, "$family", ae->family ? ae->family : "", m); - freez(replaced_info); - replaced_info = strdupz(buf); - freez(buf); - } - } - - health_string2json(wb, "\t\t", "info", replaced_info?replaced_info:"", ",\n"); + health_string2json(wb, "\t\t", "info", ae->info ? ae_info(ae) : "", ",\n"); if(unlikely(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) { buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n"); @@ -127,22 +112,23 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) buffer_strcat(wb, "\t}"); - freez(replaced_info); freez(edit_command); } void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) { - netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); buffer_strcat(wb, "["); unsigned int max = host->health_log.max; unsigned int count = 0; - uint32_t hash_chart = 0; - if (chart) hash_chart = simple_hash(chart); + + STRING *chart_string = string_strdupz(chart); + + netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); + ALARM_ENTRY *ae; for (ae = host->health_log.alarms; ae && count < max; ae = ae->next) { - if ((ae->unique_id > after) && (!chart || (ae->hash_chart == hash_chart && !strcmp(ae->chart, chart)))) { + if ((ae->unique_id > after) && (!chart || chart_string == ae->chart)) { if (likely(count)) buffer_strcat(wb, ","); health_alarm_entry2json_nolock(wb, ae, host); @@ -150,9 +136,11 @@ void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *char } } - buffer_strcat(wb, "\n]\n"); - netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); + + string_freez(chart_string); + + buffer_strcat(wb, "\n]\n"); } static inline void 
health_rrdcalc_values2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC *rc) { @@ -160,7 +148,7 @@ static inline void health_rrdcalc_values2json_nolock(RRDHOST *host, BUFFER *wb, buffer_sprintf(wb, "\t\t\"%s.%s\": {\n" "\t\t\t\"id\": %lu,\n" - , rc->chart, rc->name + , rrdcalc_chart_name(rc), rrdcalc_name(rc) , (unsigned long)rc->id); buffer_strcat(wb, "\t\t\t\"value\":"); @@ -180,22 +168,7 @@ static inline void health_rrdcalc_values2json_nolock(RRDHOST *host, BUFFER *wb, static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC *rc) { char value_string[100 + 1]; - format_value_and_unit(value_string, 100, rc->value, rc->units, -1); - - char *replaced_info = NULL; - if (likely(rc->info)) { - char *m; - replaced_info = strdupz(rc->info); - size_t pos = 0; - while ((m = strstr(replaced_info + pos, "$family"))) { - char *buf = NULL; - pos = m - replaced_info; - buf = find_and_replace(replaced_info, "$family", (rc->rrdset && rc->rrdset->family) ? rc->rrdset->family : "", m); - freez(replaced_info); - replaced_info = strdupz(buf); - freez(buf); - } - } + format_value_and_unit(value_string, 100, rc->value, rrdcalc_units(rc), -1); char hash_id[GUID_LEN + 1]; uuid_unparse_lower(rc->config_hash_id, hash_id); @@ -234,23 +207,23 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC "\t\t\t\"value_string\": \"%s\",\n" "\t\t\t\"last_repeat\": \"%lu\",\n" "\t\t\t\"times_repeat\": %lu,\n" - , rc->chart, rc->name + , rrdcalc_chart_name(rc), rrdcalc_name(rc) , (unsigned long)rc->id , hash_id - , rc->name - , rc->chart - , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:"" - , rc->classification?rc->classification:"Unknown" - , rc->component?rc->component:"Unknown" - , rc->type?rc->type:"Unknown" + , rrdcalc_name(rc) + , rrdcalc_chart_name(rc) + , (rc->rrdset)?rrdset_family(rc->rrdset):"" + , rc->classification?rrdcalc_classification(rc):"Unknown" + , rc->component?rrdcalc_component(rc):"Unknown" + , 
rc->type?rrdcalc_type(rc):"Unknown" , (rc->rrdset)?"true":"false" - , (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false" - , (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false" - , rc->exec?rc->exec:host->health_default_exec - , rc->recipient?rc->recipient:host->health_default_recipient - , rc->source - , rc->units?rc->units:"" - , replaced_info?replaced_info:"" + , (rc->run_flags & RRDCALC_FLAG_DISABLED)?"true":"false" + , (rc->run_flags & RRDCALC_FLAG_SILENCED)?"true":"false" + , rc->exec?rrdcalc_exec(rc):string2str(host->health_default_exec) + , rc->recipient?rrdcalc_recipient(rc):string2str(host->health_default_recipient) + , rrdcalc_source(rc) + , rrdcalc_units(rc) + , rrdcalc_info(rc) , rrdcalc_status2string(rc->status) , (unsigned long)rc->last_status_change , (unsigned long)rc->last_updated @@ -269,13 +242,13 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC , (unsigned long)rc->times_repeat ); - if(unlikely(rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)) { + if(unlikely(rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)) { buffer_strcat(wb, "\t\t\t\"no_clear_notification\": true,\n"); } if(RRDCALC_HAS_DB_LOOKUP(rc)) { - if(rc->dimensions && *rc->dimensions) - health_string2json(wb, "\t\t\t", "lookup_dimensions", rc->dimensions, ",\n"); + if(rc->dimensions) + health_string2json(wb, "\t\t\t", "lookup_dimensions", rrdcalc_dimensions(rc), ",\n"); buffer_sprintf(wb, "\t\t\t\"db_after\": %lu,\n" @@ -322,8 +295,6 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC buffer_strcat(wb, "\n"); buffer_strcat(wb, "\t\t}"); - - freez(replaced_info); } //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) { @@ -336,27 +307,30 @@ void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* contexts, RRDCAL char *tok = NULL; char *p = NULL; - rrdhost_rdlock(host); - if (contexts) { p = (char*)buffer_tostring(contexts); while(p && *p && (tok = mystrsep(&p, ", |"))) { 
if(!*tok) continue; - for(rc = host->alarms; rc ; rc = rc->next) { + STRING *tok_string = string_strdupz(tok); + + foreach_rrdcalc_in_rrdhost_read(host, rc) { if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) continue; if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset))) continue; - if(unlikely(rc->rrdset && rc->rrdset->hash_context == simple_hash(tok) - && !strcmp(rc->rrdset->context, tok) - && ((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status))) + if(unlikely(rc->rrdset + && rc->rrdset->context == tok_string + && ((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status))) numberOfAlarms++; } + foreach_rrdcalc_in_rrdhost_done(rc); + + string_freez(tok_string); } } else { - for(rc = host->alarms; rc ; rc = rc->next) { + foreach_rrdcalc_in_rrdhost_read(host, rc) { if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) continue; if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset))) @@ -364,16 +338,16 @@ void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* contexts, RRDCAL if(unlikely((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status)) numberOfAlarms++; } + foreach_rrdcalc_in_rrdhost_done(rc); } buffer_sprintf(wb, "%d", numberOfAlarms); - rrdhost_unlock(host); } static void health_alarms2json_fill_alarms(RRDHOST *host, BUFFER *wb, int all, void (*fp)(RRDHOST *, BUFFER *, RRDCALC *)) { RRDCALC *rc; - int i; - for(i = 0, rc = host->alarms; rc ; rc = rc->next) { + int i = 0; + foreach_rrdcalc_in_rrdhost_read(host, rc) { if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) continue; @@ -387,44 +361,43 @@ static void health_alarms2json_fill_alarms(RRDHOST *host, BUFFER *wb, int all, v fp(host, wb, rc); i++; } + foreach_rrdcalc_in_rrdhost_done(rc); } void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) { - rrdhost_rdlock(host); buffer_sprintf(wb, 
"{\n\t\"hostname\": \"%s\"," "\n\t\"latest_alarm_log_unique_id\": %u," "\n\t\"status\": %s," "\n\t\"now\": %lu," "\n\t\"alarms\": {\n", - host->hostname, + rrdhost_hostname(host), (host->health_log.next_log_id > 0)?(host->health_log.next_log_id - 1):0, host->health_enabled?"true":"false", (unsigned long)now_realtime_sec()); health_alarms2json_fill_alarms(host, wb, all, health_rrdcalc2json_nolock); +// rrdhost_rdlock(host); // buffer_strcat(wb, "\n\t},\n\t\"templates\": {"); // RRDCALCTEMPLATE *rt; // for(rt = host->templates; rt ; rt = rt->next) // health_rrdcalctemplate2json_nolock(wb, rt); +// rrdhost_unlock(host); buffer_strcat(wb, "\n\t}\n}\n"); - rrdhost_unlock(host); } void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all) { - rrdhost_rdlock(host); buffer_sprintf(wb, "{\n\t\"hostname\": \"%s\"," "\n\t\"alarms\": {\n", - host->hostname); + rrdhost_hostname(host)); health_alarms2json_fill_alarms(host, wb, all, health_rrdcalc_values2json_nolock); buffer_strcat(wb, "\n\t}\n}\n"); - rrdhost_unlock(host); } -static int have_recent_alarm(RRDHOST *host, uint32_t alarm_id, time_t mark) +static int have_recent_alarm(RRDHOST *host, uint32_t alarm_id, uint32_t mark) { ALARM_ENTRY *ae = host->health_log.alarms; diff --git a/health/health_log.c b/health/health_log.c index f0a05531d..8105e01ae 100644 --- a/health/health_log.c +++ b/health/health_log.c @@ -14,11 +14,11 @@ inline int health_alarm_log_open(RRDHOST *host) { if(host->health_log_fp) { if (setvbuf(host->health_log_fp, NULL, _IOLBF, 0) != 0) - error("HEALTH [%s]: cannot set line buffering on health log file '%s'.", host->hostname, host->health_log_filename); + error("HEALTH [%s]: cannot set line buffering on health log file '%s'.", rrdhost_hostname(host), host->health_log_filename); return 0; } - error("HEALTH [%s]: cannot open health log file '%s'. 
Health data will be lost in case of netdata or server crash.", host->hostname, host->health_log_filename); + error("HEALTH [%s]: cannot open health log file '%s'. Health data will be lost in case of netdata or server crash.", rrdhost_hostname(host), host->health_log_filename); return -1; } @@ -45,13 +45,13 @@ static inline void health_log_rotate(RRDHOST *host) { snprintfz(old_filename, FILENAME_MAX, "%s.old", host->health_log_filename); if(unlink(old_filename) == -1 && errno != ENOENT) - error("HEALTH [%s]: cannot remove old alarms log file '%s'", host->hostname, old_filename); + error("HEALTH [%s]: cannot remove old alarms log file '%s'", rrdhost_hostname(host), old_filename); if(link(host->health_log_filename, old_filename) == -1 && errno != ENOENT) - error("HEALTH [%s]: cannot move file '%s' to '%s'.", host->hostname, host->health_log_filename, old_filename); + error("HEALTH [%s]: cannot move file '%s' to '%s'.", rrdhost_hostname(host), host->health_log_filename, old_filename); if(unlink(host->health_log_filename) == -1 && errno != ENOENT) - error("HEALTH [%s]: cannot remove old alarms log file '%s'", host->hostname, host->health_log_filename); + error("HEALTH [%s]: cannot remove old alarms log file '%s'", rrdhost_hostname(host), host->health_log_filename); // open it with truncate host->health_log_fp = fopen(host->health_log_filename, "w"); @@ -59,7 +59,7 @@ static inline void health_log_rotate(RRDHOST *host) { if(host->health_log_fp) fclose(host->health_log_fp); else - error("HEALTH [%s]: cannot truncate health log '%s'", host->hostname, host->health_log_filename); + error("HEALTH [%s]: cannot truncate health log '%s'", rrdhost_hostname(host), host->health_log_filename); host->health_log_fp = NULL; @@ -75,12 +75,12 @@ inline void health_label_log_save(RRDHOST *host) { if(unlikely(host->health_log_fp)) { BUFFER *wb = buffer_create(1024); - rrdlabels_to_buffer(localhost->host_labels, wb, "", "=", "", "\t ", NULL, NULL, NULL, NULL); + 
rrdlabels_to_buffer(localhost->rrdlabels, wb, "", "=", "", "\t ", NULL, NULL, NULL, NULL); char *write = (char *) buffer_tostring(wb); if (unlikely(fprintf(host->health_log_fp, "L\t%s", write) < 0)) error("HEALTH [%s]: failed to save alarm log entry to '%s'. Health data may be lost in case of abnormal restart.", - host->hostname, host->health_log_filename); + rrdhost_hostname(host), host->health_log_filename); else host->health_log_entries_written++; @@ -103,7 +103,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { "\t%s\t%s\t%s" "\n" , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A' - , host->hostname + , rrdhost_hostname(host) , ae->unique_id , ae->alarm_id @@ -118,14 +118,14 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { , (uint32_t)ae->exec_run_timestamp , (uint32_t)ae->delay_up_to_timestamp - , (ae->name)?ae->name:"" - , (ae->chart)?ae->chart:"" - , (ae->family)?ae->family:"" - , (ae->exec)?ae->exec:"" - , (ae->recipient)?ae->recipient:"" - , (ae->source)?ae->source:"" - , (ae->units)?ae->units:"" - , (ae->info)?ae->info:"" + , ae_name(ae) + , ae_chart_name(ae) + , ae_family(ae) + , ae_exec(ae) + , ae_recipient(ae) + , ae_source(ae) + , ae_units(ae) + , ae_info(ae) , ae->exec_code , ae->new_status @@ -135,11 +135,11 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { , ae->new_value , ae->old_value , (uint64_t)ae->last_repeat - , (ae->classification)?ae->classification:"Unknown" - , (ae->component)?ae->component:"Unknown" - , (ae->type)?ae->type:"Unknown" + , (ae->classification)?ae_classification(ae):"Unknown" + , (ae->component)?ae_component(ae):"Unknown" + , (ae->type)?ae_type(ae):"Unknown" ) < 0)) - error("HEALTH [%s]: failed to save alarm log entry to '%s'. Health data may be lost in case of abnormal restart.", host->hostname, host->health_log_filename); + error("HEALTH [%s]: failed to save alarm log entry to '%s'. 
Health data may be lost in case of abnormal restart.", rrdhost_hostname(host), host->health_log_filename); else { ae->flags |= HEALTH_ENTRY_FLAG_SAVED; host->health_log_entries_written++; @@ -156,18 +156,23 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { static uint32_t is_valid_alarm_id(RRDHOST *host, const char *chart, const char *name, uint32_t alarm_id) { - uint32_t hash_chart = simple_hash(chart); - uint32_t hash_name = simple_hash(name); + STRING *chart_string = string_strdupz(chart); + STRING *name_string = string_strdupz(name); + + uint32_t ret = 1; ALARM_ENTRY *ae; for(ae = host->health_log.alarms; ae ;ae = ae->next) { - if (unlikely( - ae->alarm_id == alarm_id && (!(ae->hash_name == hash_name && ae->hash_chart == hash_chart && - !strcmp(name, ae->name) && !strcmp(chart, ae->chart))))) { - return 0; + if (unlikely(ae->alarm_id == alarm_id && (!(chart_string == ae->chart && name_string == ae->name)))) { + ret = 0; + break; } } - return 1; + + string_freez(chart_string); + string_freez(name_string); + + return ret; } static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename) { @@ -177,6 +182,14 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char size_t line = 0, len = 0; ssize_t loaded = 0, updated = 0, errored = 0, duplicate = 0; + DICTIONARY *all_rrdcalcs = dictionary_create( + DICT_OPTION_NAME_LINK_DONT_CLONE | DICT_OPTION_VALUE_LINK_DONT_CLONE | DICT_OPTION_DONT_OVERWRITE_VALUE); + RRDCALC *rc; + foreach_rrdcalc_in_rrdhost_read(host, rc) { + dictionary_set(all_rrdcalcs, rrdcalc_name(rc), rc, sizeof(*rc)); + } + foreach_rrdcalc_in_rrdhost_done(rc); + netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); while((s = fgets_trim_len(buf, 65536, fp, &len))) { @@ -192,7 +205,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *s = '\0'; pointers[entries++] = ++s; if(entries >= max_entries) { - error("HEALTH [%s]: line %zu of file '%s' has 
more than %d entries. Ignoring excessive entries.", host->hostname, line, filename, max_entries); + error("HEALTH [%s]: line %zu of file '%s' has more than %d entries. Ignoring excessive entries.", rrdhost_hostname(host), line, filename, max_entries); break; } } @@ -206,7 +219,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char ALARM_ENTRY *ae = NULL; if(entries < 27) { - error("HEALTH [%s]: line %zu of file '%s' should have at least 27 entries, but it has %d. Ignoring it.", host->hostname, line, filename, entries); + error("HEALTH [%s]: line %zu of file '%s' should have at least 27 entries, but it has %d. Ignoring it.", rrdhost_hostname(host), line, filename, entries); errored++; continue; } @@ -214,14 +227,14 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char // check that we have valid ids uint32_t unique_id = (uint32_t)strtoul(pointers[2], NULL, 16); if(!unique_id) { - error("HEALTH [%s]: line %zu of file '%s' states alarm entry with invalid unique id %u (%s). Ignoring it.", host->hostname, line, filename, unique_id, pointers[2]); + error("HEALTH [%s]: line %zu of file '%s' states alarm entry with invalid unique id %u (%s). Ignoring it.", rrdhost_hostname(host), line, filename, unique_id, pointers[2]); errored++; continue; } uint32_t alarm_id = (uint32_t)strtoul(pointers[3], NULL, 16); if(!alarm_id) { - error("HEALTH [%s]: line %zu of file '%s' states alarm entry for invalid alarm id %u (%s). Ignoring it.", host->hostname, line, filename, alarm_id, pointers[3]); + error("HEALTH [%s]: line %zu of file '%s' states alarm entry for invalid alarm id %u (%s). 
Ignoring it.", rrdhost_hostname(host), line, filename, alarm_id, pointers[3]); errored++; continue; } @@ -232,18 +245,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char char* alarm_name = pointers[13]; last_repeat = (time_t)strtoul(pointers[27], NULL, 16); - RRDCALC *rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name)); - if (!rc) { - for(rc = host->alarms; rc ; rc = rc->next) { - RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_name, (avl_t *)rc); - if(rdcmp != rc) { - error("Cannot insert the alarm index ID using log %s", rc->name); - } - } - - rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name)); - } - + rc = dictionary_get(all_rrdcalcs, alarm_name); if(unlikely(rc)) { if (rrdcalc_isrepeating(rc)) { rc->last_repeat = last_repeat; @@ -259,7 +261,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char // make sure it is properly numbered if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) { error( "HEALTH [%s]: line %zu of file '%s' has alarm log entry %u in wrong order. Ignoring it." - , host->hostname, line, filename, unique_id); + , rrdhost_hostname(host), line, filename, unique_id); errored++; continue; } @@ -272,7 +274,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char if(unlikely(unique_id == ae->unique_id)) { if(unlikely(*pointers[0] == 'A')) { error("HEALTH [%s]: line %zu of file '%s' adds duplicate alarm log entry %u. Using the later." 
- , host->hostname, line, filename, unique_id); + , rrdhost_hostname(host), line, filename, unique_id); *pointers[0] = 'U'; duplicate++; } @@ -298,8 +300,13 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char // error("HEALTH [%s]: line %zu of file '%s' provides an alarm for host '%s' but this is named '%s'.", host->hostname, line, filename, pointers[1], host->hostname); ae->unique_id = unique_id; - if (!is_valid_alarm_id(host, pointers[14], pointers[13], alarm_id)) - alarm_id = rrdcalc_get_unique_id(host, pointers[14], pointers[13], NULL); + if (!is_valid_alarm_id(host, pointers[14], pointers[13], alarm_id)) { + STRING *chart = string_strdupz(pointers[14]); + STRING *name = string_strdupz(pointers[13]); + alarm_id = rrdcalc_get_unique_id(host, chart, name, NULL); + string_freez(chart); + string_freez(name); + } ae->alarm_id = alarm_id; ae->alarm_event_id = (uint32_t)strtoul(pointers[4], NULL, 16); ae->updated_by_id = (uint32_t)strtoul(pointers[5], NULL, 16); @@ -315,36 +322,29 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char ae->exec_run_timestamp = (uint32_t)strtoul(pointers[11], NULL, 16); ae->delay_up_to_timestamp = (uint32_t)strtoul(pointers[12], NULL, 16); - freez(ae->name); - ae->name = strdupz(pointers[13]); - ae->hash_name = simple_hash(ae->name); + string_freez(ae->name); + ae->name = string_strdupz(pointers[13]); - freez(ae->chart); - ae->chart = strdupz(pointers[14]); - ae->hash_chart = simple_hash(ae->chart); + string_freez(ae->chart); + ae->chart = string_strdupz(pointers[14]); - freez(ae->family); - ae->family = strdupz(pointers[15]); + string_freez(ae->family); + ae->family = string_strdupz(pointers[15]); - freez(ae->exec); - ae->exec = strdupz(pointers[16]); - if(!*ae->exec) { freez(ae->exec); ae->exec = NULL; } + string_freez(ae->exec); + ae->exec = string_strdupz(pointers[16]); - freez(ae->recipient); - ae->recipient = strdupz(pointers[17]); - if(!*ae->recipient) { 
freez(ae->recipient); ae->recipient = NULL; } + string_freez(ae->recipient); + ae->recipient = string_strdupz(pointers[17]); - freez(ae->source); - ae->source = strdupz(pointers[18]); - if(!*ae->source) { freez(ae->source); ae->source = NULL; } + string_freez(ae->source); + ae->source = string_strdupz(pointers[18]); - freez(ae->units); - ae->units = strdupz(pointers[19]); - if(!*ae->units) { freez(ae->units); ae->units = NULL; } + string_freez(ae->units); + ae->units = string_strdupz(pointers[19]); - freez(ae->info); - ae->info = strdupz(pointers[20]); - if(!*ae->info) { freez(ae->info); ae->info = NULL; } + string_freez(ae->info); + ae->info = string_strdupz(pointers[20]); ae->exec_code = str2i(pointers[21]); ae->new_status = str2i(pointers[22]); @@ -357,24 +357,21 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char ae->last_repeat = last_repeat; if (likely(entries > 30)) { - freez(ae->classification); - ae->classification = strdupz(pointers[28]); - if(!*ae->classification) { freez(ae->classification); ae->classification = NULL; } + string_freez(ae->classification); + ae->classification = string_strdupz(pointers[28]); - freez(ae->component); - ae->component = strdupz(pointers[29]); - if(!*ae->component) { freez(ae->component); ae->component = NULL; } + string_freez(ae->component); + ae->component = string_strdupz(pointers[29]); - freez(ae->type); - ae->type = strdupz(pointers[30]); - if(!*ae->type) { freez(ae->type); ae->type = NULL; } + string_freez(ae->type); + ae->type = string_strdupz(pointers[30]); } char value_string[100 + 1]; - freez(ae->old_value_string); - freez(ae->new_value_string); - ae->old_value_string = strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae->units, -1)); - ae->new_value_string = strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae->units, -1)); + string_freez(ae->old_value_string); + string_freez(ae->new_value_string); + ae->old_value_string = 
string_strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae_units(ae), -1)); + ae->new_value_string = string_strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae_units(ae), -1)); // add it to host if not already there if(unlikely(*pointers[0] == 'A')) { @@ -395,13 +392,16 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char host->health_max_alarm_id = ae->alarm_id; } else { - error("HEALTH [%s]: line %zu of file '%s' is invalid (unrecognized entry type '%s').", host->hostname, line, filename, pointers[0]); + error("HEALTH [%s]: line %zu of file '%s' is invalid (unrecognized entry type '%s').", rrdhost_hostname(host), line, filename, pointers[0]); errored++; } } netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); + dictionary_destroy(all_rrdcalcs); + all_rrdcalcs = NULL; + freez(buf); if(!host->health_max_unique_id) host->health_max_unique_id = (uint32_t)now_realtime_sec(); @@ -411,7 +411,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char if (unlikely(!host->health_log.next_alarm_id || host->health_log.next_alarm_id <= host->health_max_alarm_id)) host->health_log.next_alarm_id = host->health_max_alarm_id + 1; - debug(D_HEALTH, "HEALTH [%s]: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", host->hostname, filename, loaded, updated, errored, duplicate); + debug(D_HEALTH, "HEALTH [%s]: loaded file '%s' with %zd new alarm entries, updated %zd alarms, errors %zd entries, duplicate %zd", rrdhost_hostname(host), filename, loaded, updated, errored, duplicate); return loaded; } @@ -422,7 +422,7 @@ inline void health_alarm_log_load(RRDHOST *host) { snprintfz(filename, FILENAME_MAX, "%s.old", host->health_log_filename); FILE *fp = fopen(filename, "r"); if(!fp) - error("HEALTH [%s]: cannot open health file: %s", host->hostname, filename); + error("HEALTH [%s]: cannot open health file: %s", rrdhost_hostname(host), filename); else 
{ health_alarm_log_read(host, fp, filename); fclose(fp); @@ -431,7 +431,7 @@ inline void health_alarm_log_load(RRDHOST *host) { host->health_log_entries_written = 0; fp = fopen(host->health_log_filename, "r"); if(!fp) - error("HEALTH [%s]: cannot open health file: %s", host->hostname, host->health_log_filename); + error("HEALTH [%s]: cannot open health file: %s", rrdhost_hostname(host), host->health_log_filename); else { health_alarm_log_read(host, fp, host->health_log_filename); fclose(fp); @@ -443,63 +443,48 @@ inline void health_alarm_log_load(RRDHOST *host) { // health alarm log management inline ALARM_ENTRY* health_create_alarm_entry( - RRDHOST *host, - uint32_t alarm_id, - uint32_t alarm_event_id, - uuid_t config_hash_id, - time_t when, - const char *name, - const char *chart, - const char *chart_context, - const char *family, - const char *class, - const char *component, - const char *type, - const char *exec, - const char *recipient, - time_t duration, - NETDATA_DOUBLE old_value, - NETDATA_DOUBLE new_value, - RRDCALC_STATUS old_status, - RRDCALC_STATUS new_status, - const char *source, - const char *units, - const char *info, - int delay, - uint32_t flags + RRDHOST *host, + uint32_t alarm_id, + uint32_t alarm_event_id, + const uuid_t config_hash_id, + time_t when, + STRING *name, + STRING *chart, + STRING *chart_context, + STRING *family, + STRING *class, + STRING *component, + STRING *type, + STRING *exec, + STRING *recipient, + time_t duration, + NETDATA_DOUBLE old_value, + NETDATA_DOUBLE new_value, + RRDCALC_STATUS old_status, + RRDCALC_STATUS new_status, + STRING *source, + STRING *units, + STRING *info, + int delay, + uint32_t flags ) { debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id); ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY)); - ae->name = strdupz(name); - ae->hash_name = simple_hash(ae->name); - - if(chart) { - ae->chart = strdupz(chart); - ae->hash_chart = simple_hash(ae->chart); - } - - 
if(chart_context) - ae->chart_context = strdupz(chart_context); + ae->name = string_dup(name); + ae->chart = string_dup(chart); + ae->chart_context = string_dup(chart_context); uuid_copy(ae->config_hash_id, *((uuid_t *) config_hash_id)); - if(family) - ae->family = strdupz(family); - - if (class) - ae->classification = strdupz(class); - - if (component) - ae->component = strdupz(component); - - if (type) - ae->type = strdupz(type); - - if(exec) ae->exec = strdupz(exec); - if(recipient) ae->recipient = strdupz(recipient); - if(source) ae->source = strdupz(source); - if(units) ae->units = strdupz(units); + ae->family = string_dup(family); + ae->classification = string_dup(class); + ae->component = string_dup(component); + ae->type = string_dup(type); + ae->exec = string_dup(exec); + ae->recipient = string_dup(recipient); + ae->source = string_dup(source); + ae->units = string_dup(units); ae->unique_id = host->health_log.next_log_id++; ae->alarm_id = alarm_id; @@ -509,27 +494,10 @@ inline ALARM_ENTRY* health_create_alarm_entry( ae->new_value = new_value; char value_string[100 + 1]; - ae->old_value_string = strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae->units, -1)); - ae->new_value_string = strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae->units, -1)); - - char *replaced_info = NULL; - if (likely(info)) { - char *m; - replaced_info = strdupz(info); - size_t pos = 0; - while ((m = strstr(replaced_info + pos, "$family"))) { - char *buf = NULL; - pos = m - replaced_info; - buf = find_and_replace(replaced_info, "$family", (ae->family) ? 
ae->family : "", m); - freez(replaced_info); - replaced_info = strdupz(buf); - freez(buf); - } - } - - if(replaced_info) ae->info = strdupz(replaced_info); - freez(replaced_info); + ae->old_value_string = string_strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae_units(ae), -1)); + ae->new_value_string = string_strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae_units(ae), -1)); + ae->info = string_dup(info); ae->old_status = old_status; ae->new_status = new_status; ae->duration = duration; @@ -545,7 +513,7 @@ inline ALARM_ENTRY* health_create_alarm_entry( return ae; } -inline void health_alarm_log( +inline void health_alarm_log_add_entry( RRDHOST *host, ALARM_ENTRY *ae ) { @@ -585,26 +553,24 @@ inline void health_alarm_log( } inline void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae) { - freez(ae->name); - freez(ae->chart); - freez(ae->chart_context); - freez(ae->family); - freez(ae->classification); - freez(ae->component); - freez(ae->type); - freez(ae->exec); - freez(ae->recipient); - freez(ae->source); - freez(ae->units); - freez(ae->info); - freez(ae->old_value_string); - freez(ae->new_value_string); + string_freez(ae->name); + string_freez(ae->chart); + string_freez(ae->chart_context); + string_freez(ae->family); + string_freez(ae->classification); + string_freez(ae->component); + string_freez(ae->type); + string_freez(ae->exec); + string_freez(ae->recipient); + string_freez(ae->source); + string_freez(ae->units); + string_freez(ae->info); + string_freez(ae->old_value_string); + string_freez(ae->new_value_string); freez(ae); } inline void health_alarm_log_free(RRDHOST *host) { - rrdhost_check_wrlock(host); - netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock); ALARM_ENTRY *ae; diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in index 0dfecade5..3edf3d083 100755 --- a/health/notifications/alarm-notify.sh.in +++ b/health/notifications/alarm-notify.sh.in @@ -199,7 +199,7 
@@ fi [ -z "${NETDATA_STOCK_CONFIG_DIR}" ] && NETDATA_STOCK_CONFIG_DIR="@libconfigdir_POST@" [ -z "${NETDATA_CACHE_DIR}" ] && NETDATA_CACHE_DIR="@cachedir_POST@" [ -z "${NETDATA_REGISTRY_URL}" ] && NETDATA_REGISTRY_URL="https://registry.my-netdata.io" -[ -z "${NETDATA_REGISTRY_CLOUD_BASE_URL}" ] && NETDATA_REGISTRY_CLOUD_BASE_URL="https://app.netdata.cloud" +[ -z "${NETDATA_REGISTRY_CLOUD_BASE_URL}" ] && NETDATA_REGISTRY_CLOUD_BASE_URL="https://api.netdata.cloud" # ----------------------------------------------------------------------------- # parse command line parameters @@ -250,7 +250,7 @@ fi # ----------------------------------------------------------------------------- # find a suitable hostname to use, if netdata did not supply a hostname -if [ -z ${args_host} ]; then +if [ -z "${args_host}" ]; then this_host=$(hostname -s 2>/dev/null) host="${this_host}" args_host="${this_host}" @@ -428,6 +428,10 @@ else done fi +if [[ ! $curl_options =~ .*\--connect-timeout ]]; then + curl_options+=" --connect-timeout 5" +fi + OPSGENIE_API_URL=${OPSGENIE_API_URL:-"https://api.opsgenie.com"} # If we didn't autodetect the character set for e-mail and it wasn't @@ -1335,21 +1339,37 @@ send_telegram() { if [ "${SEND_TELEGRAM}" = "YES" ] && [ -n "${bottoken}" ] && [ -n "${chatids}" ] && [ -n "${message}" ]; then for chatid in ${chatids}; do - # https://core.telegram.org/bots/api#sendmessage - httpcode=$(docurl ${disableNotification} \ - --data-urlencode "parse_mode=HTML" \ - --data-urlencode "disable_web_page_preview=true" \ - --data-urlencode "text=${emoji} ${message}" \ - "https://api.telegram.org/bot${bottoken}/sendMessage?chat_id=${chatid}") - - if [ "${httpcode}" = "200" ]; then - info "sent telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}'" - sent=$((sent + 1)) - elif [ "${httpcode}" = "401" ]; then - error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}': Wrong bot token." 
- else - error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}' with HTTP response status code ${httpcode}." - fi + notify_telegram=1 + notify_retries=${TELEGRAM_RETRIES_ON_LIMIT:-0} + + while [ ${notify_telegram} -eq 1 ]; do + # https://core.telegram.org/bots/api#sendmessage + httpcode=$(docurl ${disableNotification} \ + --data-urlencode "parse_mode=HTML" \ + --data-urlencode "disable_web_page_preview=true" \ + --data-urlencode "text=${emoji} ${message}" \ + "https://api.telegram.org/bot${bottoken}/sendMessage?chat_id=${chatid}") + + notify_telegram=0 + + if [ "${httpcode}" = "200" ]; then + info "sent telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}'" + sent=$((sent + 1)) + elif [ "${httpcode}" = "401" ]; then + error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}': Wrong bot token." + elif [ "${httpcode}" = "429" ]; then + if [ "$notify_retries" -gt 0 ]; then + error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}': rate limit exceeded, retrying after 1s." + notify_retries=$((notify_retries - 1)) + notify_telegram=1 + sleep 1 + else + error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}': rate limit exceeded." + fi + else + error "failed to send telegram notification for: ${host} ${chart}.${name} is ${status} to '${chatid}' with HTTP response status code ${httpcode}." 
+ fi + done done [ ${sent} -gt 0 ] && return 0 @@ -2398,7 +2418,7 @@ status_email_subject="${status}" case "${status}" in CRITICAL) image="${images_base_url}/images/alert-128-red.png" - alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_critical.png" + alarm_badge="https://app.netdata.cloud/static/email/img/label_critical.png" status_message="is critical" status_email_subject="Critical" color="#ca414b" @@ -2411,7 +2431,7 @@ CRITICAL) WARNING) image="${images_base_url}/images/alert-128-orange.png" - alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_warning.png" + alarm_badge="https://app.netdata.cloud/static/email/img/label_warning.png" status_message="needs attention" status_email_subject="Warning" color="#ffc107" @@ -2424,7 +2444,7 @@ WARNING) CLEAR) image="${images_base_url}/images/check-mark-2-128-green.png" - alarm_badge="${NETDATA_REGISTRY_CLOUD_BASE_URL}/static/email/img/label_recovered.png" + alarm_badge="https://app.netdata.cloud/static/email/img/label_recovered.png" status_message="recovered" status_email_subject="Clear" color="#77ca6d" diff --git a/health/notifications/health_alarm_notify.conf b/health/notifications/health_alarm_notify.conf index b69c6d538..52de86645 100755 --- a/health/notifications/health_alarm_notify.conf +++ b/health/notifications/health_alarm_notify.conf @@ -443,6 +443,11 @@ SEND_TELEGRAM="YES" # Without it, netdata cannot send telegram messages. TELEGRAM_BOT_TOKEN="" +# If an API limit error is returned on sending a message, Netdata will retry this number of times before giving up. +# Setting the number to 0 makes Netdata do no retries (which is the default). +# See https://core.telegram.org/bots/faq#my-bot-is-hitting-limits-how-do-i-avoid-this +TELEGRAM_RETRIES_ON_LIMIT="0" + # To get your chat ID send the command /getid to telegram bot @myidbot # (https://t.me/myidbot). Each user also needs to open a conversation with the # bot that will be sending notifications. |