diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2022-08-12 07:26:17 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2022-08-12 07:26:17 +0000 |
commit | 7877a98bd9c00db5e81dd2f8c734cba2bab20be7 (patch) | |
tree | d18b767250f7c7ced9b8abe2ece784ac1fe24d3e /health | |
parent | Releasing debian version 1.35.1-2. (diff) | |
download | netdata-7877a98bd9c00db5e81dd2f8c734cba2bab20be7.tar.xz netdata-7877a98bd9c00db5e81dd2f8c734cba2bab20be7.zip |
Merging upstream version 1.36.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health')
-rw-r--r-- | health/Makefile.am | 1 | ||||
-rw-r--r-- | health/REFERENCE.md | 62 | ||||
-rw-r--r-- | health/health.c | 112 | ||||
-rw-r--r-- | health/health.d/cgroups.conf | 70 | ||||
-rw-r--r-- | health/health.d/go.d.plugin.conf | 2 | ||||
-rw-r--r-- | health/health.d/ml.conf | 36 | ||||
-rw-r--r-- | health/health.d/python.d.plugin.conf | 2 | ||||
-rw-r--r-- | health/health.d/ram.conf | 6 | ||||
-rw-r--r-- | health/health.d/redis.conf | 7 | ||||
-rw-r--r-- | health/health.d/web_log.conf | 214 | ||||
-rw-r--r-- | health/health.h | 7 | ||||
-rw-r--r-- | health/health_config.c | 42 | ||||
-rw-r--r-- | health/health_json.c | 2 | ||||
-rw-r--r-- | health/health_log.c | 32 | ||||
-rwxr-xr-x | health/notifications/alarm-notify.sh.in | 8 | ||||
-rw-r--r-- | health/notifications/msteams/README.md | 2 |
16 files changed, 301 insertions, 304 deletions
diff --git a/health/Makefile.am b/health/Makefile.am index d5eb8846..777b3585 100644 --- a/health/Makefile.am +++ b/health/Makefile.am @@ -61,6 +61,7 @@ dist_healthconfig_DATA = \ health.d/megacli.conf \ health.d/memcached.conf \ health.d/memory.conf \ + health.d/ml.conf \ health.d/mysql.conf \ health.d/net.conf \ health.d/netfilter.conf \ diff --git a/health/REFERENCE.md b/health/REFERENCE.md index 3c1e53b2..d1af7476 100644 --- a/health/REFERENCE.md +++ b/health/REFERENCE.md @@ -895,6 +895,68 @@ lookup: mean -10s of user Since [`z = (x - mean) / stddev`](https://en.wikipedia.org/wiki/Standard_score) we create two input alarms, one for `mean` and one for `stddev` and then use them both as inputs in our final `cpu_user_zscore` alarm. +### Example 8 - [Anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) based CPU dimensions alarm + +Warning if 5 minute rolling [anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) for any CPU dimension is above 5%, critical if it goes above 20%: + +```yaml +template: ml_5min_cpu_dims + on: system.cpu + os: linux + hosts: * + lookup: average -5m anomaly-bit foreach * + calc: $this + units: % + every: 30s + warn: $this > (($status >= $WARNING) ? (5) : (20)) + crit: $this > (($status == $CRITICAL) ? (20) : (100)) + info: rolling 5min anomaly rate for each system.cpu dimension +``` + +The `lookup` line will calculate the average anomaly rate of each `system.cpu` dimension over the last 5 minutes. In this case +Netdata will create alarms for all dimensions of the chart. 
+ +### Example 9 - [Anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) based CPU chart alarm + +Warning if 5 minute rolling [anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) averaged across all CPU dimensions is above 5%, critical if it goes above 20%: + +```yaml +template: ml_5min_cpu_chart + on: system.cpu + os: linux + hosts: * + lookup: average -5m anomaly-bit of * + calc: $this + units: % + every: 30s + warn: $this > (($status >= $WARNING) ? (5) : (20)) + crit: $this > (($status == $CRITICAL) ? (20) : (100)) + info: rolling 5min anomaly rate for system.cpu chart +``` + +The `lookup` line will calculate the average anomaly rate across all `system.cpu` dimensions over the last 5 minutes. In this case +Netdata will create one alarm for the chart. + +### Example 10 - [Anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) based node level alarm + +Warning if 5 minute rolling [anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) averaged across all ML enabled dimensions is above 5%, critical if it goes above 20%: + +```yaml +template: ml_5min_node + on: anomaly_detection.anomaly_rate + os: linux + hosts: * + lookup: average -5m of anomaly_rate + calc: $this + units: % + every: 30s + warn: $this > (($status >= $WARNING) ? (5) : (20)) + crit: $this > (($status == $CRITICAL) ? (20) : (100)) + info: rolling 5min anomaly rate for all ML enabled dims +``` + +The `lookup` line will use the `anomaly_rate` dimension of the `anomaly_detection.anomaly_rate` ML chart to calculate the average [node level anomaly rate](https://learn.netdata.cloud/docs/agent/ml#node-anomaly-rate) over the last 5 minutes. 
+ ## Troubleshooting You can compile Netdata with [debugging](/daemon/README.md#debugging) and then set in `netdata.conf`: diff --git a/health/health.c b/health/health.c index 3c1e5693..9eb36a9c 100644 --- a/health/health.c +++ b/health/health.c @@ -11,6 +11,12 @@ static struct { ALARM_ENTRY *tail; // latest } alarm_notifications_in_progress = {NULL, NULL}; +typedef struct active_alerts { + char *name; + time_t last_status_change; + RRDCALC_STATUS status; +} active_alerts_t; + static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae) { ae->prev_in_progress = NULL; @@ -219,10 +225,6 @@ static void health_reload_host(RRDHOST *host) { * Reload the host configuration for all hosts. */ void health_reload(void) { -#ifdef ENABLE_ACLK - if (netdata_cloud_setting) - aclk_single_update_disable(); -#endif sql_refresh_hashes(); rrd_rdlock(); @@ -234,11 +236,7 @@ void health_reload(void) { rrd_unlock(); #ifdef ENABLE_ACLK if (netdata_cloud_setting) { - aclk_single_update_enable(); - aclk_alarm_reload(); -#ifdef ENABLE_NEW_CLOUD_PROTOCOL aclk_alert_reloaded = 1; -#endif } #endif } @@ -246,13 +244,22 @@ void health_reload(void) { // ---------------------------------------------------------------------------- // health main thread and friends -static inline RRDCALC_STATUS rrdcalc_value2status(calculated_number n) { +static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) { if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED; if(n) return RRDCALC_STATUS_RAISED; return RRDCALC_STATUS_CLEAR; } #define ALARM_EXEC_COMMAND_LENGTH 8192 +#define ACTIVE_ALARMS_LIST_EXAMINE 500 +#define ACTIVE_ALARMS_LIST 15 + +static inline int compare_active_alerts(const void * a, const void * b) { + active_alerts_t *active_alerts_a = (active_alerts_t *)a; + active_alerts_t *active_alerts_b = (active_alerts_t *)b; + + return ( active_alerts_b->last_status_change - active_alerts_a->last_status_change ); +} static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { 
ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED; @@ -318,31 +325,28 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { RRDCALC *rc; EVAL_EXPRESSION *expr=NULL; BUFFER *warn_alarms, *crit_alarms; + active_alerts_t *active_alerts = callocz(ACTIVE_ALARMS_LIST_EXAMINE, sizeof(active_alerts_t)); warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE); - for(rc = host->alarms; rc ; rc = rc->next) { + for(rc = host->alarms; rc && (n_warn + n_crit) < ACTIVE_ALARMS_LIST_EXAMINE ; rc = rc->next) { if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) continue; if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) { if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) { - if (n_warn) - buffer_strcat(warn_alarms, ","); - buffer_strcat(warn_alarms, rc->name); - buffer_strcat(warn_alarms, "="); - buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change); + active_alerts[n_warn+n_crit].name = rc->name; + active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change; + active_alerts[n_warn+n_crit].status = rc->status; n_warn++; } else if (ae->alarm_id == rc->id) expr = rc->warning; } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) { if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) { - if (n_crit) - buffer_strcat(crit_alarms, ","); - buffer_strcat(crit_alarms, rc->name); - buffer_strcat(crit_alarms, "="); - buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change); + active_alerts[n_warn+n_crit].name = rc->name; + active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change; + active_alerts[n_warn+n_crit].status = rc->status; n_crit++; } else if (ae->alarm_id == rc->id) expr = rc->critical; @@ -352,9 +356,34 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { } } + if (n_warn+n_crit>1) + qsort 
(active_alerts, n_warn+n_crit, sizeof(active_alerts_t), compare_active_alerts); + + int count_w = 0, count_c = 0; + while (count_w + count_c < n_warn + n_crit && count_w + count_c < ACTIVE_ALARMS_LIST) { + if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_WARNING) { + if (count_w) + buffer_strcat(warn_alarms, ","); + buffer_strcat(warn_alarms, active_alerts[count_w+count_c].name); + buffer_strcat(warn_alarms, "="); + buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change); + count_w++; + } + else if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_CRITICAL) { + if (count_c) + buffer_strcat(crit_alarms, ","); + buffer_strcat(crit_alarms, active_alerts[count_w+count_c].name); + buffer_strcat(crit_alarms, "="); + buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change); + count_c++; + } + } + char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN"); - snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'", + snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" NETDATA_DOUBLE_FORMAT_ZERO + "' '" NETDATA_DOUBLE_FORMAT_ZERO + "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'", exec, recipient, host->registry_hostname, @@ -398,6 +427,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { freez(edit_command); buffer_free(warn_alarms); buffer_free(crit_alarms); + freez(active_alerts); return; //health_alarm_wait_for_execution done: @@ -419,7 +449,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) { } static inline void 
health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) { - debug(D_HEALTH, "Health alarm '%s.%s' = " CALCULATED_NUMBER_FORMAT_AUTO " - changed status from %s to %s", + debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s", ae->chart?ae->chart:"NOCHART", ae->name, ae->new_value, rrdcalc_status2string(ae->old_status), @@ -736,7 +766,7 @@ void *health_main(void *ptr) { rrdcalc_labels_unlink(); unsigned int loop = 0; -#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) +#ifdef ENABLE_ACLK unsigned int marked_aclk_reload_loop = 0; #endif while(!netdata_exit) { @@ -765,7 +795,7 @@ void *health_main(void *ptr) { } } -#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) +#ifdef ENABLE_ACLK if (aclk_alert_reloaded && !marked_aclk_reload_loop) marked_aclk_reload_loop = loop; #endif @@ -795,6 +825,11 @@ void *health_main(void *ptr) { host->health_delay_up_to = 0; } + // wait until cleanup of obsolete charts on children is complete + if (host != localhost) + if (unlikely(host->trigger_chart_obsoletion_check == 1)) + continue; + if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0)) sql_health_alarm_log_cleanup(host); @@ -818,7 +853,7 @@ void *health_main(void *ptr) { worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY); time_t now = now_realtime_sec(); ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, + host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context, rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, rc->value, NAN, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0, 0); if (ae) { @@ -828,7 +863,7 @@ void *health_main(void *ptr) { rc->last_status_change = now; rc->last_updated = now; rc->value = NAN; -#if defined(ENABLE_ACLK) && 
defined(ENABLE_NEW_CLOUD_PROTOCOL) +#ifdef ENABLE_ACLK if (netdata_cloud_setting && likely(!aclk_alert_reloaded)) sql_queue_alarm_to_aclk(host, ae, 1); #endif @@ -855,10 +890,12 @@ void *health_main(void *ptr) { /* time_t old_db_timestamp = rc->db_before; */ int value_is_null = 0; - int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1, rc->after, - rc->before, rc->group, 0, rc->options, &rc->db_after, - &rc->db_before, &value_is_null, 0 - ); + int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1, + rc->after, rc->before, rc->group, NULL, + 0, rc->options, + &rc->db_after,&rc->db_before, + NULL, NULL, NULL, + &value_is_null, NULL, 0, 0); if (unlikely(ret != 200)) { // database lookup failed @@ -898,8 +935,7 @@ void *health_main(void *ptr) { } else rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN; - debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " - CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, + debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->value ); } @@ -923,7 +959,7 @@ void *health_main(void *ptr) { rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR; debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value " - CALCULATED_NUMBER_FORMAT + NETDATA_DOUBLE_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->calculation->parsed_as, rc->calculation->result, buffer_tostring(rc->calculation->error_msg), rc->source @@ -972,7 +1008,7 @@ void *health_main(void *ptr) { } else { rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR; debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value " - CALCULATED_NUMBER_FORMAT + NETDATA_DOUBLE_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? 
rc->chart : "NOCHART", rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source ); @@ -998,7 +1034,7 @@ void *health_main(void *ptr) { } else { rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR; debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value " - CALCULATED_NUMBER_FORMAT + NETDATA_DOUBLE_FORMAT ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg), rc->source @@ -1077,7 +1113,7 @@ void *health_main(void *ptr) { ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, + host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context, rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, rc->delay_last, @@ -1129,7 +1165,7 @@ void *health_main(void *ptr) { rc->last_repeat = now; if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++; ALARM_ENTRY *ae = health_create_alarm_entry( - host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, + host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context, rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change, rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info, rc->delay_last, @@ -1178,7 +1214,7 @@ void *health_main(void *ptr) { health_alarm_wait_for_execution(ae); } -#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL) +#ifdef ENABLE_ACLK if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) { rrdhost_foreach_read(host) { if (unlikely(!host->health_enabled)) diff --git 
a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf index aa416c79..4bfe38b6 100644 --- a/health/health.d/cgroups.conf +++ b/health/health.d/cgroups.conf @@ -69,3 +69,73 @@ component: Network info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \ compared to the rate over the last minute to: sysadmin + +# ---------------------------------K8s containers-------------------------------------------- + + template: k8s_cgroup_10min_cpu_usage + on: k8s.cgroup.cpu_limit + class: Utilization + type: Cgroups +component: CPU + os: linux + hosts: * + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + info: average cgroup CPU utilization over the last 10 minutes + to: sysadmin + + template: k8s_cgroup_ram_in_use + on: k8s.cgroup.mem_usage + class: Utilization + type: Cgroups +component: Memory + os: linux + hosts: * + calc: ($ram) * 100 / $memory_limit + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: cgroup memory utilization + to: sysadmin + +# check for packet storms + +# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate +# 2. do the same for the last 10s +# 3. 
raise an alarm if the later is 10x or 20x the first +# we assume the minimum packet storm should at least have +# 10000 packets/s, average of the last 10 seconds + + template: k8s_cgroup_1m_received_packets_rate + on: k8s.cgroup.net_packets + class: Workload + type: Cgroups +component: Network + hosts: * + lookup: average -1m unaligned of received + units: packets + every: 10s + info: average number of packets received by the network interface $family over the last minute + + template: k8s_cgroup_10s_received_packets_storm + on: k8s.cgroup.net_packets + class: Workload + type: Cgroups +component: Network + hosts: * + lookup: average -10s unaligned of received + calc: $this * 100 / (($k8s_cgroup_10s_received_packets_storm < 1000)?(1000):($k8s_cgroup_10s_received_packets_storm)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(200):(5000)) + crit: $this > (($status == $CRITICAL)?(5000):(6000)) + options: no-clear-notification + info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \ + compared to the rate over the last minute + to: sysadmin diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf index 8bf84a97..a84ab342 100644 --- a/health/health.d/go.d.plugin.conf +++ b/health/health.d/go.d.plugin.conf @@ -6,7 +6,7 @@ class: Error type: Netdata component: go.d.plugin - module: * + module: !* * calc: $now - $last_collected_t units: seconds ago every: 10s diff --git a/health/health.d/ml.conf b/health/health.d/ml.conf new file mode 100644 index 00000000..9bcc81e7 --- /dev/null +++ b/health/health.d/ml.conf @@ -0,0 +1,36 @@ +# below are some examples of using the `anomaly-bit` option to define alerts based on anomaly +# rates as opposed to raw metric values. 
You can read more about the anomaly-bit and Netdata's +# native anomaly detection here: +# https://learn.netdata.cloud/docs/configure/machine-learning#anomaly-bit---100--anomalous-0--normal + +# examples below are commented, you would need to uncomment and adjust as desired to enable them. + +# alert per dimension example +# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error). +# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error). +# template: ml_5min_cpu_dims +# on: system.cpu +# os: linux +# hosts: * +# lookup: average -5m anomaly-bit foreach * +# calc: $this +# units: % +# every: 30s +# warn: $this > (($status >= $WARNING) ? (5) : (20)) +# crit: $this > (($status == $CRITICAL) ? (20) : (100)) +# info: rolling 5min anomaly rate for each system.cpu dimension + +# alert per chart example +# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error). +# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error). +# template: ml_5min_cpu_chart +# on: system.cpu +# os: linux +# hosts: * +# lookup: average -5m anomaly-bit of * +# calc: $this +# units: % +# every: 30s +# warn: $this > (($status >= $WARNING) ? (5) : (20)) +# crit: $this > (($status == $CRITICAL) ? (20) : (100)) +# info: rolling 5min anomaly rate for system.cpu chart
\ No newline at end of file diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf index f3abc588..e3b3d11c 100644 --- a/health/health.d/python.d.plugin.conf +++ b/health/health.d/python.d.plugin.conf @@ -6,7 +6,7 @@ class: Error type: Netdata component: python.d.plugin - module: * + module: !* * calc: $now - $last_collected_t units: seconds ago every: 10s diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index ff5f3ac1..ab382c43 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -54,7 +54,7 @@ host labels: _is_k8s_node = false component: Memory os: freebsd hosts: * - calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive) + calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers + $cache + $free + $inactive) units: % every: 10s warn: $this > (($status >= $WARNING) ? (80) : (90)) @@ -64,13 +64,13 @@ component: Memory to: sysadmin alarm: ram_available - on: system.ram + on: mem.available class: Utilization type: System component: Memory os: freebsd hosts: * - calc: ($free + $inactive + $cache) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers) + calc: $avail * 100 / ($system.ram.free + $system.ram.active + $system.ram.inactive + $system.ram.wired + $system.ram.cache + $system.ram.laundry + $system.ram.buffers) units: % every: 10s warn: $this < (($status >= $WARNING) ? 
(15) : (10)) diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf index dfb771e8..cad5230c 100644 --- a/health/health.d/redis.conf +++ b/health/health.d/redis.conf @@ -6,7 +6,7 @@ type: KV Storage component: Redis every: 10s - crit: $rdb_last_bgsave_status != 0 + crit: $last_bgsave != nan AND $last_bgsave != 0 units: ok/failed info: status of the last RDB save operation (0: ok, 1: error) delay: down 5m multiplier 1.5 max 1h @@ -19,8 +19,9 @@ component: Redis type: KV Storage component: Redis every: 10s - warn: $rdb_bgsave_in_progress > 600 - crit: $rdb_bgsave_in_progress > 1200 + calc: $current_bgsave_time + warn: $this > 600 + crit: $this > 1200 units: seconds info: duration of the on-going RDB save operation delay: down 5m multiplier 1.5 max 1h diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf index 454e0abe..c33c4664 100644 --- a/health/health.d/web_log.conf +++ b/health/health.d/web_log.conf @@ -1,218 +1,4 @@ -# ----------------------------------------------------------------------------- -# high level response code alarms - -# the following alarms trigger only when there are enough data. -# we assume there are enough data when: -# -# $1m_requests > 120 -# -# i.e. when there are at least 120 requests during the last minute - - template: 1m_requests - on: web_log.response_statuses - class: Workload - type: Web Server -component: Web log - families: * - lookup: sum -1m unaligned - calc: ($this == 0)?(1):($this) - units: requests - every: 10s - info: number of HTTP requests in the last minute - - template: 1m_successful - on: web_log.response_statuses - class: Workload - type: Web Server -component: Web log - families: * - lookup: sum -1m unaligned of successful_requests - calc: $this * 100 / $1m_requests - units: % - every: 10s - warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 ) - crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? 
( 85 ) : ( 75 )) ) : ( 0 ) - delay: up 2m down 15m multiplier 1.5 max 1h - info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401) - to: webmaster - - template: 1m_redirects - on: web_log.response_statuses - class: Workload - type: Web Server -component: Web log - families: * - lookup: sum -1m unaligned of redirects - calc: $this * 100 / $1m_requests - units: % - every: 10s - warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 ) - delay: up 2m down 15m multiplier 1.5 max 1h - info: ratio of redirection HTTP requests over the last minute (3xx except 304) - to: webmaster - - template: 1m_bad_requests - on: web_log.response_statuses - class: Errors - type: Web Server -component: Web log - families: * - lookup: sum -1m unaligned of bad_requests - calc: $this * 100 / $1m_requests - units: % - every: 10s - warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 ) - delay: up 2m down 15m multiplier 1.5 max 1h - info: ratio of client error HTTP requests over the last minute (4xx except 401) - to: webmaster - - template: 1m_internal_errors - on: web_log.response_statuses - class: Errors - type: Web Server -component: Web log - families: * - lookup: sum -1m unaligned of server_errors - calc: $this * 100 / $1m_requests - units: % - every: 10s - warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 ) - crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 ) - delay: up 2m down 15m multiplier 1.5 max 1h - info: ratio of server error HTTP requests over the last minute (5xx) - to: webmaster - -# unmatched lines - -# the following alarms trigger only when there are enough data. -# we assume there are enough data when: -# -# $1m_total_requests > 120 -# -# i.e. 
when there are at least 120 requests during the last minute - - template: 1m_total_requests - on: web_log.response_codes - class: Workload - type: Web Server -component: Web log - families: * - lookup: sum -1m unaligned - calc: ($this == 0)?(1):($this) - units: requests - every: 10s - info: number of HTTP requests over the last minute - - template: 1m_unmatched - on: web_log.response_codes - class: Errors - type: Web Server -component: Web log - families: * - lookup: sum -1m unaligned of unmatched - calc: $this * 100 / $1m_total_requests - units: % - every: 10s - warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 ) - delay: up 1m down 5m multiplier 1.5 max 1h - info: percentage of unparsed log lines over the last minute - to: webmaster - -# ----------------------------------------------------------------------------- -# web slow - -# the following alarms trigger only when there are enough data. -# we assume there are enough data when: -# -# $1m_requests > 120 -# -# i.e. when there are at least 120 requests during the last minute - - template: 10m_response_time - on: web_log.response_time - class: Latency - type: System -component: Web log - families: * - lookup: average -10m unaligned of avg - units: ms - every: 30s - info: average HTTP response time over the last 10 minutes - - template: web_slow - on: web_log.response_time - class: Latency - type: Web Server -component: Web log - families: * - lookup: average -1m unaligned of avg - units: ms - every: 10s - green: 500 - red: 1000 - warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 ) - crit: ($1m_requests > 120) ? 
($this > $red && $this > ($10m_response_time * 4) ) : ( 0 ) - delay: down 15m multiplier 1.5 max 1h - info: average HTTP response time over the last minute - options: no-clear-notification - to: webmaster - -# ----------------------------------------------------------------------------- -# web too many or too few requests - -# the following alarms trigger only when there are enough data. -# we assume there are enough data when: -# -# $5m_successful_old > 120 -# -# i.e. when there were at least 120 requests during the 5 minutes starting -# at -10m and ending at -5m - - template: 5m_successful_old - on: web_log.response_statuses - class: Workload - type: Web Server -component: Web log - families: * - lookup: average -5m at -5m unaligned of successful_requests - units: requests/s - every: 30s - info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago - - template: 5m_successful - on: web_log.response_statuses - class: Workload - type: Web Server -component: Web log - families: * - lookup: average -5m unaligned of successful_requests - units: requests/s - every: 30s - info: average number of successful HTTP requests over the last 5 minutes - - template: 5m_requests_ratio - on: web_log.response_codes - class: Workload - type: Web Server -component: Web log - families: * - calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100) - units: % - every: 30s - warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0) - crit: ($5m_successful_old > 120) ? 
($this > 400 OR $this < 25) : (0) - delay: down 15m multiplier 1.5 max 1h - options: no-clear-notification - info: ratio of successful HTTP requests over the last 5 minutes, \ - compared with the previous 5 minutes \ - (clear notification for this alarm will not be sent) - to: webmaster - - - -# ---------------------------------------------------GO-VERSION--------------------------------------------------------- - # unmatched lines # the following alarms trigger only when there are enough data. diff --git a/health/health.h b/health/health.h index f25ae6bc..3e77c12a 100644 --- a/health/health.h +++ b/health/health.h @@ -35,7 +35,7 @@ extern void health_init(void); extern void health_reload(void); -extern int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result); +extern int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, NETDATA_DOUBLE *result); extern void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status); extern void health_alarms2json(RRDHOST *host, BUFFER *wb, int all); extern void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all); @@ -56,6 +56,7 @@ extern ALARM_ENTRY* health_create_alarm_entry( time_t when, const char *name, const char *chart, + const char *chart_context, const char *family, const char *classification, const char *component, @@ -63,8 +64,8 @@ extern ALARM_ENTRY* health_create_alarm_entry( const char *exec, const char *recipient, time_t duration, - calculated_number old_value, - calculated_number new_value, + NETDATA_DOUBLE old_value, + NETDATA_DOUBLE new_value, RRDCALC_STATUS old_status, RRDCALC_STATUS new_status, const char *source, diff --git a/health/health_config.c b/health/health_config.c index df6d7b60..e1dd32ab 100644 --- a/health/health_config.c +++ b/health/health_config.c @@ -54,7 +54,9 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { rc->id = rrdcalc_get_unique_id(host, rc->chart, 
rc->name, &rc->next_event_id); - debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", + debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " NETDATA_DOUBLE_FORMAT_AUTO + ", red " NETDATA_DOUBLE_FORMAT_AUTO + ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", rc->chart?rc->chart:"NOCHART", rc->name, rc->id, @@ -141,7 +143,9 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL } } - debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", + debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " NETDATA_DOUBLE_FORMAT_AUTO + ", red " NETDATA_DOUBLE_FORMAT_AUTO + ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, 
crit_repeat_every %u", rt->name, (rt->context)?rt->context:"NONE", (rt->exec)?rt->exec:"DEFAULT", @@ -848,7 +852,7 @@ static int health_readfile(const char *filename, void *data) { else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) { alert_cfg->green = strdupz(value); char *e; - rc->green = str2ld(value, &e); + rc->green = str2ndd(value, &e); if(e && *e) { error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.", line, filename, rc->name, key, e); @@ -857,7 +861,7 @@ static int health_readfile(const char *filename, void *data) { else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) { alert_cfg->red = strdupz(value); char *e; - rc->red = str2ld(value, &e); + rc->red = str2ndd(value, &e); if(e && *e) { error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.", line, filename, rc->name, key, e); @@ -955,17 +959,17 @@ static int health_readfile(const char *filename, void *data) { } else if(hash == hash_host_label && !strcasecmp(key, HEALTH_HOST_LABEL_KEY)) { alert_cfg->host_labels = strdupz(value); - if(rc->labels) { - if(strcmp(rc->labels, value) != 0) + if(rc->host_labels) { + if(strcmp(rc->host_labels, value) != 0) error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'.", line, filename, rc->name, key, value, value); - freez(rc->labels); - simple_pattern_free(rc->splabels); + freez(rc->host_labels); + simple_pattern_free(rc->host_labels_pattern); } - rc->labels = simple_pattern_trim_around_equal(value); - rc->splabels = simple_pattern_create(rc->labels, NULL, SIMPLE_PATTERN_EXACT); + rc->host_labels = simple_pattern_trim_around_equal(value); + rc->host_labels_pattern = simple_pattern_create(rc->host_labels, NULL, SIMPLE_PATTERN_EXACT); } else if(hash == hash_plugin && !strcasecmp(key, HEALTH_PLUGIN_KEY)) { alert_cfg->plugin = strdupz(value); @@ 
-1097,7 +1101,7 @@ static int health_readfile(const char *filename, void *data) { else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) { alert_cfg->green = strdupz(value); char *e; - rt->green = str2ld(value, &e); + rt->green = str2ndd(value, &e); if(e && *e) { error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.", line, filename, rt->name, key, e); @@ -1106,7 +1110,7 @@ static int health_readfile(const char *filename, void *data) { else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) { alert_cfg->red = strdupz(value); char *e; - rt->red = str2ld(value, &e); + rt->red = str2ndd(value, &e); if(e && *e) { error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.", line, filename, rt->name, key, e); @@ -1204,17 +1208,17 @@ static int health_readfile(const char *filename, void *data) { } else if(hash == hash_host_label && !strcasecmp(key, HEALTH_HOST_LABEL_KEY)) { alert_cfg->host_labels = strdupz(value); - if(rt->labels) { - if(strcmp(rt->labels, value) != 0) + if(rt->host_labels) { + if(strcmp(rt->host_labels, value) != 0) error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. 
Using ('%s').", - line, filename, rt->name, key, rt->labels, value, value); + line, filename, rt->name, key, rt->host_labels, value, value); - freez(rt->labels); - simple_pattern_free(rt->splabels); + freez(rt->host_labels); + simple_pattern_free(rt->host_labels_pattern); } - rt->labels = simple_pattern_trim_around_equal(value); - rt->splabels = simple_pattern_create(rt->labels, NULL, SIMPLE_PATTERN_EXACT); + rt->host_labels = simple_pattern_trim_around_equal(value); + rt->host_labels_pattern = simple_pattern_create(rt->host_labels, NULL, SIMPLE_PATTERN_EXACT); } else { error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.", diff --git a/health/health_json.c b/health/health_json.c index d5285c11..4e8f4376 100644 --- a/health/health_json.c +++ b/health/health_json.c @@ -29,6 +29,7 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) "\t\t\"config_hash_id\": \"%s\",\n" "\t\t\"name\": \"%s\",\n" "\t\t\"chart\": \"%s\",\n" + "\t\t\"context\": \"%s\",\n" "\t\t\"family\": \"%s\",\n" "\t\t\"class\": \"%s\",\n" "\t\t\"component\": \"%s\",\n" @@ -65,6 +66,7 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) , config_hash_id , ae->name , ae->chart + , ae->chart_context , ae->family , ae->classification?ae->classification:"Unknown" , ae->component?ae->component:"Unknown" diff --git a/health/health_log.c b/health/health_log.c index 54f6dc9f..f0a05531 100644 --- a/health/health_log.c +++ b/health/health_log.c @@ -74,28 +74,15 @@ inline void health_label_log_save(RRDHOST *host) { if(unlikely(host->health_log_fp)) { BUFFER *wb = buffer_create(1024); - rrdhost_check_rdlock(host); - netdata_rwlock_rdlock(&host->labels.labels_rwlock); - struct label *l=localhost->labels.head; - while (l != NULL) { - buffer_sprintf(wb,"%s=%s\t ", l->key, l->value); - l = l->next; - } - netdata_rwlock_unlock(&host->labels.labels_rwlock); - - char *write = (char *) buffer_tostring(wb) ; - 
write[wb->len-2] = '\n'; - write[wb->len-1] = '\0'; + rrdlabels_to_buffer(localhost->host_labels, wb, "", "=", "", "\t ", NULL, NULL, NULL, NULL); + char *write = (char *) buffer_tostring(wb); - if (unlikely(fprintf(host->health_log_fp, "L\t%s" - , write - ) < 0)) + if (unlikely(fprintf(host->health_log_fp, "L\t%s", write) < 0)) error("HEALTH [%s]: failed to save alarm log entry to '%s'. Health data may be lost in case of abnormal restart.", host->hostname, host->health_log_filename); - else { + else host->health_log_entries_written++; - } buffer_free(wb); } @@ -111,7 +98,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { "\t%08x\t%08x\t%08x" "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" "\t%d\t%d\t%d\t%d" - "\t" CALCULATED_NUMBER_FORMAT_AUTO "\t" CALCULATED_NUMBER_FORMAT_AUTO + "\t" NETDATA_DOUBLE_FORMAT_AUTO "\t" NETDATA_DOUBLE_FORMAT_AUTO "\t%016"PRIx64"" "\t%s\t%s\t%s" "\n" @@ -463,6 +450,7 @@ inline ALARM_ENTRY* health_create_alarm_entry( time_t when, const char *name, const char *chart, + const char *chart_context, const char *family, const char *class, const char *component, @@ -470,8 +458,8 @@ inline ALARM_ENTRY* health_create_alarm_entry( const char *exec, const char *recipient, time_t duration, - calculated_number old_value, - calculated_number new_value, + NETDATA_DOUBLE old_value, + NETDATA_DOUBLE new_value, RRDCALC_STATUS old_status, RRDCALC_STATUS new_status, const char *source, @@ -491,6 +479,9 @@ inline ALARM_ENTRY* health_create_alarm_entry( ae->hash_chart = simple_hash(ae->chart); } + if(chart_context) + ae->chart_context = strdupz(chart_context); + uuid_copy(ae->config_hash_id, *((uuid_t *) config_hash_id)); if(family) @@ -596,6 +587,7 @@ inline void health_alarm_log( inline void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae) { freez(ae->name); freez(ae->chart); + freez(ae->chart_context); freez(ae->family); freez(ae->classification); freez(ae->component); diff --git a/health/notifications/alarm-notify.sh.in 
b/health/notifications/alarm-notify.sh.in index 38a69a0f..0dfecade 100755 --- a/health/notifications/alarm-notify.sh.in +++ b/health/notifications/alarm-notify.sh.in @@ -2898,6 +2898,10 @@ if [ -n "$total_crit_alarms" ]; then done <<<"$total_crit_alarms," fi +if (( total_warnings + total_critical > 15 )); then + EXTRA_ALARMS_LIST_TEXT="(Showing latest 15 alerts)" +fi + if [ -n "$edit_command_line" ]; then IFS='=' read -r edit_command line s_host <<<"$edit_command_line" fi @@ -3423,6 +3427,10 @@ Content-Transfer-Encoding: 8bit <span style="font-weight:600">${total_critical} critical</span> additional active alert(s)</div> </td> + </tr> + <td align="left" style="font-size:0px;padding:10px 25px;word-break:break-word;"> + <div style="font-family:Open Sans, sans-serif;font-size:12px;line-height:1;text-align:center;color:#35414A;">${EXTRA_ALARMS_LIST_TEXT}</div> + </td> </tr> </tbody> </table> diff --git a/health/notifications/msteams/README.md b/health/notifications/msteams/README.md index 14dbe751..c9a13bac 100644 --- a/health/notifications/msteams/README.md +++ b/health/notifications/msteams/README.md @@ -1,8 +1,6 @@ <!-- ---- title: "Microsoft Teams" custom_edit_url: https://github.com/netdata/netdata/edit/master/health/notifications/msteams/README.md ---- --> # Microsoft Teams |