summaryrefslogtreecommitdiffstats
path: root/health
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2022-08-12 07:26:11 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2022-08-12 07:26:11 +0000
commit3c315f0fff93aa072472abc10815963ac0035268 (patch)
treea95f6a96e0e7bd139c010f8dc60b40e5b3062a99 /health
parentAdding upstream version 1.35.1. (diff)
downloadnetdata-upstream/1.36.0.tar.xz
netdata-upstream/1.36.0.zip
Adding upstream version 1.36.0.upstream/1.36.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health')
-rw-r--r--health/Makefile.am1
-rw-r--r--health/REFERENCE.md62
-rw-r--r--health/health.c112
-rw-r--r--health/health.d/cgroups.conf70
-rw-r--r--health/health.d/go.d.plugin.conf2
-rw-r--r--health/health.d/ml.conf36
-rw-r--r--health/health.d/python.d.plugin.conf2
-rw-r--r--health/health.d/ram.conf6
-rw-r--r--health/health.d/redis.conf7
-rw-r--r--health/health.d/web_log.conf214
-rw-r--r--health/health.h7
-rw-r--r--health/health_config.c42
-rw-r--r--health/health_json.c2
-rw-r--r--health/health_log.c32
-rwxr-xr-xhealth/notifications/alarm-notify.sh.in8
-rw-r--r--health/notifications/msteams/README.md2
16 files changed, 301 insertions, 304 deletions
diff --git a/health/Makefile.am b/health/Makefile.am
index d5eb88468..777b35858 100644
--- a/health/Makefile.am
+++ b/health/Makefile.am
@@ -61,6 +61,7 @@ dist_healthconfig_DATA = \
health.d/megacli.conf \
health.d/memcached.conf \
health.d/memory.conf \
+ health.d/ml.conf \
health.d/mysql.conf \
health.d/net.conf \
health.d/netfilter.conf \
diff --git a/health/REFERENCE.md b/health/REFERENCE.md
index 3c1e53b2a..d1af74767 100644
--- a/health/REFERENCE.md
+++ b/health/REFERENCE.md
@@ -895,6 +895,68 @@ lookup: mean -10s of user
Since [`z = (x - mean) / stddev`](https://en.wikipedia.org/wiki/Standard_score) we create two input alarms, one for `mean` and one for `stddev` and then use them both as inputs in our final `cpu_user_zscore` alarm.
+### Example 8 - [Anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) based CPU dimensions alarm
+
+Warning if 5 minute rolling [anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) for any CPU dimension is above 5%, critical if it goes above 20%:
+
+```yaml
+template: ml_5min_cpu_dims
+ on: system.cpu
+ os: linux
+ hosts: *
+ lookup: average -5m anomaly-bit foreach *
+ calc: $this
+ units: %
+ every: 30s
+ warn: $this > (($status >= $WARNING) ? (5) : (20))
+ crit: $this > (($status == $CRITICAL) ? (20) : (100))
+ info: rolling 5min anomaly rate for each system.cpu dimension
+```
+
+The `lookup` line will calculate the average anomaly rate of each `system.cpu` dimension over the last 5 minutes. In this case
+Netdata will create alarms for all dimensions of the chart.
+
+### Example 9 - [Anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) based CPU chart alarm
+
+Warning if 5 minute rolling [anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) averaged across all CPU dimensions is above 5%, critical if it goes above 20%:
+
+```yaml
+template: ml_5min_cpu_chart
+ on: system.cpu
+ os: linux
+ hosts: *
+ lookup: average -5m anomaly-bit of *
+ calc: $this
+ units: %
+ every: 30s
+ warn: $this > (($status >= $WARNING) ? (5) : (20))
+ crit: $this > (($status == $CRITICAL) ? (20) : (100))
+ info: rolling 5min anomaly rate for system.cpu chart
+```
+
+The `lookup` line will calculate the average anomaly rate across all `system.cpu` dimensions over the last 5 minutes. In this case
+Netdata will create one alarm for the chart.
+
+### Example 10 - [Anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) based node level alarm
+
+Warning if 5 minute rolling [anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) averaged across all ML enabled dimensions is above 5%, critical if it goes above 20%:
+
+```yaml
+template: ml_5min_node
+ on: anomaly_detection.anomaly_rate
+ os: linux
+ hosts: *
+ lookup: average -5m of anomaly_rate
+ calc: $this
+ units: %
+ every: 30s
+ warn: $this > (($status >= $WARNING) ? (5) : (20))
+ crit: $this > (($status == $CRITICAL) ? (20) : (100))
+ info: rolling 5min anomaly rate for all ML enabled dims
+```
+
+The `lookup` line will use the `anomaly_rate` dimension of the `anomaly_detection.anomaly_rate` ML chart to calculate the average [node level anomaly rate](https://learn.netdata.cloud/docs/agent/ml#node-anomaly-rate) over the last 5 minutes.
+
## Troubleshooting
You can compile Netdata with [debugging](/daemon/README.md#debugging) and then set in `netdata.conf`:
diff --git a/health/health.c b/health/health.c
index 3c1e5693e..9eb36a9c6 100644
--- a/health/health.c
+++ b/health/health.c
@@ -11,6 +11,12 @@ static struct {
ALARM_ENTRY *tail; // latest
} alarm_notifications_in_progress = {NULL, NULL};
+typedef struct active_alerts { // snapshot of one raised alert, collected while building an alarm notification
+ char *name; // alert name; assigned from rc->name, so presumably borrowed rather than owned
+ time_t last_status_change; // copied from the alert; the key compare_active_alerts() sorts on
+ RRDCALC_STATUS status; // status copied from the alert when collected (warning or critical)
+} active_alerts_t;
+
static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae)
{
ae->prev_in_progress = NULL;
@@ -219,10 +225,6 @@ static void health_reload_host(RRDHOST *host) {
* Reload the host configuration for all hosts.
*/
void health_reload(void) {
-#ifdef ENABLE_ACLK
- if (netdata_cloud_setting)
- aclk_single_update_disable();
-#endif
sql_refresh_hashes();
rrd_rdlock();
@@ -234,11 +236,7 @@ void health_reload(void) {
rrd_unlock();
#ifdef ENABLE_ACLK
if (netdata_cloud_setting) {
- aclk_single_update_enable();
- aclk_alarm_reload();
-#ifdef ENABLE_NEW_CLOUD_PROTOCOL
aclk_alert_reloaded = 1;
-#endif
}
#endif
}
@@ -246,13 +244,22 @@ void health_reload(void) {
// ----------------------------------------------------------------------------
// health main thread and friends
-static inline RRDCALC_STATUS rrdcalc_value2status(calculated_number n) {
+static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) {
if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
if(n) return RRDCALC_STATUS_RAISED;
return RRDCALC_STATUS_CLEAR;
}
#define ALARM_EXEC_COMMAND_LENGTH 8192
+#define ACTIVE_ALARMS_LIST_EXAMINE 500
+#define ACTIVE_ALARMS_LIST 15
+
+static inline int compare_active_alerts(const void * a, const void * b) {
+    const active_alerts_t *active_alerts_a = a;
+    const active_alerts_t *active_alerts_b = b;
+    // qsort() comparator, newest status change first; compare rather than subtract, since a time_t difference may not fit in the int result
+    return (active_alerts_b->last_status_change > active_alerts_a->last_status_change) - (active_alerts_b->last_status_change < active_alerts_a->last_status_change);
+}
static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
@@ -318,31 +325,28 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
RRDCALC *rc;
EVAL_EXPRESSION *expr=NULL;
BUFFER *warn_alarms, *crit_alarms;
+ active_alerts_t *active_alerts = callocz(ACTIVE_ALARMS_LIST_EXAMINE, sizeof(active_alerts_t));
warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
- for(rc = host->alarms; rc ; rc = rc->next) {
+ for(rc = host->alarms; rc && (n_warn + n_crit) < ACTIVE_ALARMS_LIST_EXAMINE ; rc = rc->next) {
if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
continue;
if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
- if (n_warn)
- buffer_strcat(warn_alarms, ",");
- buffer_strcat(warn_alarms, rc->name);
- buffer_strcat(warn_alarms, "=");
- buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change);
+ active_alerts[n_warn+n_crit].name = rc->name;
+ active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
+ active_alerts[n_warn+n_crit].status = rc->status;
n_warn++;
} else if (ae->alarm_id == rc->id)
expr = rc->warning;
} else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
- if (n_crit)
- buffer_strcat(crit_alarms, ",");
- buffer_strcat(crit_alarms, rc->name);
- buffer_strcat(crit_alarms, "=");
- buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change);
+ active_alerts[n_warn+n_crit].name = rc->name;
+ active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
+ active_alerts[n_warn+n_crit].status = rc->status;
n_crit++;
} else if (ae->alarm_id == rc->id)
expr = rc->critical;
@@ -352,9 +356,34 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
}
}
+ if (n_warn+n_crit>1)
+ qsort (active_alerts, n_warn+n_crit, sizeof(active_alerts_t), compare_active_alerts);
+
+ int count_w = 0, count_c = 0;
+ while (count_w + count_c < n_warn + n_crit && count_w + count_c < ACTIVE_ALARMS_LIST) {
+ if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_WARNING) {
+ if (count_w)
+ buffer_strcat(warn_alarms, ",");
+ buffer_strcat(warn_alarms, active_alerts[count_w+count_c].name);
+ buffer_strcat(warn_alarms, "=");
+ buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
+ count_w++;
+ }
+ else if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_CRITICAL) {
+ if (count_c)
+ buffer_strcat(crit_alarms, ",");
+ buffer_strcat(crit_alarms, active_alerts[count_w+count_c].name);
+ buffer_strcat(crit_alarms, "=");
+ buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
+ count_c++;
+ }
+ }
+
char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN");
- snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
+ snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" NETDATA_DOUBLE_FORMAT_ZERO
+ "' '" NETDATA_DOUBLE_FORMAT_ZERO
+ "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
exec,
recipient,
host->registry_hostname,
@@ -398,6 +427,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
freez(edit_command);
buffer_free(warn_alarms);
buffer_free(crit_alarms);
+ freez(active_alerts);
return; //health_alarm_wait_for_execution
done:
@@ -419,7 +449,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
}
static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
- debug(D_HEALTH, "Health alarm '%s.%s' = " CALCULATED_NUMBER_FORMAT_AUTO " - changed status from %s to %s",
+ debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
ae->chart?ae->chart:"NOCHART", ae->name,
ae->new_value,
rrdcalc_status2string(ae->old_status),
@@ -736,7 +766,7 @@ void *health_main(void *ptr) {
rrdcalc_labels_unlink();
unsigned int loop = 0;
-#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
+#ifdef ENABLE_ACLK
unsigned int marked_aclk_reload_loop = 0;
#endif
while(!netdata_exit) {
@@ -765,7 +795,7 @@ void *health_main(void *ptr) {
}
}
-#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
+#ifdef ENABLE_ACLK
if (aclk_alert_reloaded && !marked_aclk_reload_loop)
marked_aclk_reload_loop = loop;
#endif
@@ -795,6 +825,11 @@ void *health_main(void *ptr) {
host->health_delay_up_to = 0;
}
+ // wait until cleanup of obsolete charts on children is complete
+ if (host != localhost)
+ if (unlikely(host->trigger_chart_obsoletion_check == 1))
+ continue;
+
if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
sql_health_alarm_log_cleanup(host);
@@ -818,7 +853,7 @@ void *health_main(void *ptr) {
worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
time_t now = now_realtime_sec();
ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
+ host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
rc->value, NAN, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0, 0);
if (ae) {
@@ -828,7 +863,7 @@ void *health_main(void *ptr) {
rc->last_status_change = now;
rc->last_updated = now;
rc->value = NAN;
-#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
+#ifdef ENABLE_ACLK
if (netdata_cloud_setting && likely(!aclk_alert_reloaded))
sql_queue_alarm_to_aclk(host, ae, 1);
#endif
@@ -855,10 +890,12 @@ void *health_main(void *ptr) {
/* time_t old_db_timestamp = rc->db_before; */
int value_is_null = 0;
- int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1, rc->after,
- rc->before, rc->group, 0, rc->options, &rc->db_after,
- &rc->db_before, &value_is_null, 0
- );
+ int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1,
+ rc->after, rc->before, rc->group, NULL,
+ 0, rc->options,
+ &rc->db_after,&rc->db_before,
+ NULL, NULL, NULL,
+ &value_is_null, NULL, 0, 0);
if (unlikely(ret != 200)) {
// database lookup failed
@@ -898,8 +935,7 @@ void *health_main(void *ptr) {
} else
rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value "
- CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
rc->value
);
}
@@ -923,7 +959,7 @@ void *health_main(void *ptr) {
rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
- CALCULATED_NUMBER_FORMAT
+ NETDATA_DOUBLE_FORMAT
": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
rc->calculation->parsed_as, rc->calculation->result,
buffer_tostring(rc->calculation->error_msg), rc->source
@@ -972,7 +1008,7 @@ void *health_main(void *ptr) {
} else {
rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
- CALCULATED_NUMBER_FORMAT
+ NETDATA_DOUBLE_FORMAT
": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source
);
@@ -998,7 +1034,7 @@ void *health_main(void *ptr) {
} else {
rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
- CALCULATED_NUMBER_FORMAT
+ NETDATA_DOUBLE_FORMAT
": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg),
rc->source
@@ -1077,7 +1113,7 @@ void *health_main(void *ptr) {
ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
+ host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
rc->delay_last,
@@ -1129,7 +1165,7 @@ void *health_main(void *ptr) {
rc->last_repeat = now;
if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
+ host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
rc->delay_last,
@@ -1178,7 +1214,7 @@ void *health_main(void *ptr) {
health_alarm_wait_for_execution(ae);
}
-#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
+#ifdef ENABLE_ACLK
if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
rrdhost_foreach_read(host) {
if (unlikely(!host->health_enabled))
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index aa416c795..4bfe38b65 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -69,3 +69,73 @@ component: Network
info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \
compared to the rate over the last minute
to: sysadmin
+
+# ---------------------------------K8s containers--------------------------------------------
+
+ template: k8s_cgroup_10min_cpu_usage
+ on: k8s.cgroup.cpu_limit
+ class: Utilization
+ type: Cgroups
+component: CPU
+ os: linux
+ hosts: *
+ lookup: average -10m unaligned
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average cgroup CPU utilization over the last 10 minutes
+ to: sysadmin
+
+ template: k8s_cgroup_ram_in_use
+ on: k8s.cgroup.mem_usage
+ class: Utilization
+ type: Cgroups
+component: Memory
+ os: linux
+ hosts: *
+ calc: ($ram) * 100 / $memory_limit
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: cgroup memory utilization
+ to: sysadmin
+
+# check for packet storms
+
+# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
+# 2. do the same for the last 10s
+# 3. raise an alarm if the latter is 10x or 20x the first
+# we assume the minimum packet storm should at least have
+# 10000 packets/s, average of the last 10 seconds
+
+ template: k8s_cgroup_1m_received_packets_rate
+ on: k8s.cgroup.net_packets
+ class: Workload
+ type: Cgroups
+component: Network
+ hosts: *
+ lookup: average -1m unaligned of received
+ units: packets
+ every: 10s
+ info: average number of packets received by the network interface $family over the last minute
+
+ template: k8s_cgroup_10s_received_packets_storm
+ on: k8s.cgroup.net_packets
+ class: Workload
+ type: Cgroups
+component: Network
+ hosts: *
+ lookup: average -10s unaligned of received
+ calc: $this * 100 / (($k8s_cgroup_1m_received_packets_rate < 1000)?(1000):($k8s_cgroup_1m_received_packets_rate))
+ every: 10s
+ units: %
+ warn: $this > (($status >= $WARNING)?(200):(5000))
+ crit: $this > (($status == $CRITICAL)?(5000):(6000))
+ options: no-clear-notification
+ info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \
+ compared to the rate over the last minute
+ to: sysadmin
diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf
index 8bf84a976..a84ab342f 100644
--- a/health/health.d/go.d.plugin.conf
+++ b/health/health.d/go.d.plugin.conf
@@ -6,7 +6,7 @@
class: Error
type: Netdata
component: go.d.plugin
- module: *
+ module: !* *
calc: $now - $last_collected_t
units: seconds ago
every: 10s
diff --git a/health/health.d/ml.conf b/health/health.d/ml.conf
new file mode 100644
index 000000000..9bcc81e76
--- /dev/null
+++ b/health/health.d/ml.conf
@@ -0,0 +1,36 @@
+# below are some examples of using the `anomaly-bit` option to define alerts based on anomaly
+# rates as opposed to raw metric values. You can read more about the anomaly-bit and Netdata's
+# native anomaly detection here:
+# https://learn.netdata.cloud/docs/configure/machine-learning#anomaly-bit---100--anomalous-0--normal
+
+# examples below are commented, you would need to uncomment and adjust as desired to enable them.
+
+# alert per dimension example
+# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error).
+# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error).
+# template: ml_5min_cpu_dims
+# on: system.cpu
+# os: linux
+# hosts: *
+# lookup: average -5m anomaly-bit foreach *
+# calc: $this
+# units: %
+# every: 30s
+# warn: $this > (($status >= $WARNING) ? (5) : (20))
+# crit: $this > (($status == $CRITICAL) ? (20) : (100))
+# info: rolling 5min anomaly rate for each system.cpu dimension
+
+# alert per chart example
+# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error).
+# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error).
+# template: ml_5min_cpu_chart
+# on: system.cpu
+# os: linux
+# hosts: *
+# lookup: average -5m anomaly-bit of *
+# calc: $this
+# units: %
+# every: 30s
+# warn: $this > (($status >= $WARNING) ? (5) : (20))
+# crit: $this > (($status == $CRITICAL) ? (20) : (100))
+# info: rolling 5min anomaly rate for system.cpu chart \ No newline at end of file
diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf
index f3abc588f..e3b3d11cf 100644
--- a/health/health.d/python.d.plugin.conf
+++ b/health/health.d/python.d.plugin.conf
@@ -6,7 +6,7 @@
class: Error
type: Netdata
component: python.d.plugin
- module: *
+ module: !* *
calc: $now - $last_collected_t
units: seconds ago
every: 10s
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index ff5f3ac17..ab382c43b 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -54,7 +54,7 @@ host labels: _is_k8s_node = false
component: Memory
os: freebsd
hosts: *
- calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+ calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers + $cache + $free + $inactive)
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (80) : (90))
@@ -64,13 +64,13 @@ component: Memory
to: sysadmin
alarm: ram_available
- on: system.ram
+ on: mem.available
class: Utilization
type: System
component: Memory
os: freebsd
hosts: *
- calc: ($free + $inactive + $cache) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+ calc: $avail * 100 / ($system.ram.free + $system.ram.active + $system.ram.inactive + $system.ram.wired + $system.ram.cache + $system.ram.laundry + $system.ram.buffers)
units: %
every: 10s
warn: $this < (($status >= $WARNING) ? (15) : (10))
diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf
index dfb771e8c..cad5230c5 100644
--- a/health/health.d/redis.conf
+++ b/health/health.d/redis.conf
@@ -6,7 +6,7 @@
type: KV Storage
component: Redis
every: 10s
- crit: $rdb_last_bgsave_status != 0
+ crit: $last_bgsave != nan AND $last_bgsave != 0
units: ok/failed
info: status of the last RDB save operation (0: ok, 1: error)
delay: down 5m multiplier 1.5 max 1h
@@ -19,8 +19,9 @@ component: Redis
type: KV Storage
component: Redis
every: 10s
- warn: $rdb_bgsave_in_progress > 600
- crit: $rdb_bgsave_in_progress > 1200
+ calc: $current_bgsave_time
+ warn: $this > 600
+ crit: $this > 1200
units: seconds
info: duration of the on-going RDB save operation
delay: down 5m multiplier 1.5 max 1h
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
index 454e0abef..c33c4664c 100644
--- a/health/health.d/web_log.conf
+++ b/health/health.d/web_log.conf
@@ -1,218 +1,4 @@
-# -----------------------------------------------------------------------------
-# high level response code alarms
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-# $1m_requests > 120
-#
-# i.e. when there are at least 120 requests during the last minute
-
- template: 1m_requests
- on: web_log.response_statuses
- class: Workload
- type: Web Server
-component: Web log
- families: *
- lookup: sum -1m unaligned
- calc: ($this == 0)?(1):($this)
- units: requests
- every: 10s
- info: number of HTTP requests in the last minute
-
- template: 1m_successful
- on: web_log.response_statuses
- class: Workload
- type: Web Server
-component: Web log
- families: *
- lookup: sum -1m unaligned of successful_requests
- calc: $this * 100 / $1m_requests
- units: %
- every: 10s
- warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
- crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
- delay: up 2m down 15m multiplier 1.5 max 1h
- info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
- to: webmaster
-
- template: 1m_redirects
- on: web_log.response_statuses
- class: Workload
- type: Web Server
-component: Web log
- families: *
- lookup: sum -1m unaligned of redirects
- calc: $this * 100 / $1m_requests
- units: %
- every: 10s
- warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
- delay: up 2m down 15m multiplier 1.5 max 1h
- info: ratio of redirection HTTP requests over the last minute (3xx except 304)
- to: webmaster
-
- template: 1m_bad_requests
- on: web_log.response_statuses
- class: Errors
- type: Web Server
-component: Web log
- families: *
- lookup: sum -1m unaligned of bad_requests
- calc: $this * 100 / $1m_requests
- units: %
- every: 10s
- warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
- delay: up 2m down 15m multiplier 1.5 max 1h
- info: ratio of client error HTTP requests over the last minute (4xx except 401)
- to: webmaster
-
- template: 1m_internal_errors
- on: web_log.response_statuses
- class: Errors
- type: Web Server
-component: Web log
- families: *
- lookup: sum -1m unaligned of server_errors
- calc: $this * 100 / $1m_requests
- units: %
- every: 10s
- warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
- crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
- delay: up 2m down 15m multiplier 1.5 max 1h
- info: ratio of server error HTTP requests over the last minute (5xx)
- to: webmaster
-
-# unmatched lines
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-# $1m_total_requests > 120
-#
-# i.e. when there are at least 120 requests during the last minute
-
- template: 1m_total_requests
- on: web_log.response_codes
- class: Workload
- type: Web Server
-component: Web log
- families: *
- lookup: sum -1m unaligned
- calc: ($this == 0)?(1):($this)
- units: requests
- every: 10s
- info: number of HTTP requests over the last minute
-
- template: 1m_unmatched
- on: web_log.response_codes
- class: Errors
- type: Web Server
-component: Web log
- families: *
- lookup: sum -1m unaligned of unmatched
- calc: $this * 100 / $1m_total_requests
- units: %
- every: 10s
- warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 )
- delay: up 1m down 5m multiplier 1.5 max 1h
- info: percentage of unparsed log lines over the last minute
- to: webmaster
-
-# -----------------------------------------------------------------------------
-# web slow
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-# $1m_requests > 120
-#
-# i.e. when there are at least 120 requests during the last minute
-
- template: 10m_response_time
- on: web_log.response_time
- class: Latency
- type: System
-component: Web log
- families: *
- lookup: average -10m unaligned of avg
- units: ms
- every: 30s
- info: average HTTP response time over the last 10 minutes
-
- template: web_slow
- on: web_log.response_time
- class: Latency
- type: Web Server
-component: Web log
- families: *
- lookup: average -1m unaligned of avg
- units: ms
- every: 10s
- green: 500
- red: 1000
- warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
- crit: ($1m_requests > 120) ? ($this > $red && $this > ($10m_response_time * 4) ) : ( 0 )
- delay: down 15m multiplier 1.5 max 1h
- info: average HTTP response time over the last minute
- options: no-clear-notification
- to: webmaster
-
-# -----------------------------------------------------------------------------
-# web too many or too few requests
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-# $5m_successful_old > 120
-#
-# i.e. when there were at least 120 requests during the 5 minutes starting
-# at -10m and ending at -5m
-
- template: 5m_successful_old
- on: web_log.response_statuses
- class: Workload
- type: Web Server
-component: Web log
- families: *
- lookup: average -5m at -5m unaligned of successful_requests
- units: requests/s
- every: 30s
- info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
-
- template: 5m_successful
- on: web_log.response_statuses
- class: Workload
- type: Web Server
-component: Web log
- families: *
- lookup: average -5m unaligned of successful_requests
- units: requests/s
- every: 30s
- info: average number of successful HTTP requests over the last 5 minutes
-
- template: 5m_requests_ratio
- on: web_log.response_codes
- class: Workload
- type: Web Server
-component: Web log
- families: *
- calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100)
- units: %
- every: 30s
- warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
- crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
- delay: down 15m multiplier 1.5 max 1h
- options: no-clear-notification
- info: ratio of successful HTTP requests over the last 5 minutes, \
- compared with the previous 5 minutes \
- (clear notification for this alarm will not be sent)
- to: webmaster
-
-
-
-# ---------------------------------------------------GO-VERSION---------------------------------------------------------
-
# unmatched lines
# the following alarms trigger only when there are enough data.
diff --git a/health/health.h b/health/health.h
index f25ae6bc6..3e77c12a7 100644
--- a/health/health.h
+++ b/health/health.h
@@ -35,7 +35,7 @@ extern void health_init(void);
extern void health_reload(void);
-extern int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result);
+extern int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, NETDATA_DOUBLE *result);
extern void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status);
extern void health_alarms2json(RRDHOST *host, BUFFER *wb, int all);
extern void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all);
@@ -56,6 +56,7 @@ extern ALARM_ENTRY* health_create_alarm_entry(
time_t when,
const char *name,
const char *chart,
+ const char *chart_context,
const char *family,
const char *classification,
const char *component,
@@ -63,8 +64,8 @@ extern ALARM_ENTRY* health_create_alarm_entry(
const char *exec,
const char *recipient,
time_t duration,
- calculated_number old_value,
- calculated_number new_value,
+ NETDATA_DOUBLE old_value,
+ NETDATA_DOUBLE new_value,
RRDCALC_STATUS old_status,
RRDCALC_STATUS new_status,
const char *source,
diff --git a/health/health_config.c b/health/health_config.c
index df6d7b609..e1dd32ab1 100644
--- a/health/health_config.c
+++ b/health/health_config.c
@@ -54,7 +54,9 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id);
- debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
+ debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " NETDATA_DOUBLE_FORMAT_AUTO
+ ", red " NETDATA_DOUBLE_FORMAT_AUTO
+ ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
rc->chart?rc->chart:"NOCHART",
rc->name,
rc->id,
@@ -141,7 +143,9 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL
}
}
- debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
+ debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " NETDATA_DOUBLE_FORMAT_AUTO
+ ", red " NETDATA_DOUBLE_FORMAT_AUTO
+ ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
rt->name,
(rt->context)?rt->context:"NONE",
(rt->exec)?rt->exec:"DEFAULT",
@@ -848,7 +852,7 @@ static int health_readfile(const char *filename, void *data) {
else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
alert_cfg->green = strdupz(value);
char *e;
- rc->green = str2ld(value, &e);
+ rc->green = str2ndd(value, &e);
if(e && *e) {
error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
line, filename, rc->name, key, e);
@@ -857,7 +861,7 @@ static int health_readfile(const char *filename, void *data) {
else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
alert_cfg->red = strdupz(value);
char *e;
- rc->red = str2ld(value, &e);
+ rc->red = str2ndd(value, &e);
if(e && *e) {
error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
line, filename, rc->name, key, e);
@@ -955,17 +959,17 @@ static int health_readfile(const char *filename, void *data) {
}
else if(hash == hash_host_label && !strcasecmp(key, HEALTH_HOST_LABEL_KEY)) {
alert_cfg->host_labels = strdupz(value);
- if(rc->labels) {
- if(strcmp(rc->labels, value) != 0)
+ if(rc->host_labels) {
+ if(strcmp(rc->host_labels, value) != 0)
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'.",
line, filename, rc->name, key, value, value);
- freez(rc->labels);
- simple_pattern_free(rc->splabels);
+ freez(rc->host_labels);
+ simple_pattern_free(rc->host_labels_pattern);
}
- rc->labels = simple_pattern_trim_around_equal(value);
- rc->splabels = simple_pattern_create(rc->labels, NULL, SIMPLE_PATTERN_EXACT);
+ rc->host_labels = simple_pattern_trim_around_equal(value);
+ rc->host_labels_pattern = simple_pattern_create(rc->host_labels, NULL, SIMPLE_PATTERN_EXACT);
}
else if(hash == hash_plugin && !strcasecmp(key, HEALTH_PLUGIN_KEY)) {
alert_cfg->plugin = strdupz(value);
@@ -1097,7 +1101,7 @@ static int health_readfile(const char *filename, void *data) {
else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
alert_cfg->green = strdupz(value);
char *e;
- rt->green = str2ld(value, &e);
+ rt->green = str2ndd(value, &e);
if(e && *e) {
error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
line, filename, rt->name, key, e);
@@ -1106,7 +1110,7 @@ static int health_readfile(const char *filename, void *data) {
else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
alert_cfg->red = strdupz(value);
char *e;
- rt->red = str2ld(value, &e);
+ rt->red = str2ndd(value, &e);
if(e && *e) {
error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
line, filename, rt->name, key, e);
@@ -1204,17 +1208,17 @@ static int health_readfile(const char *filename, void *data) {
}
else if(hash == hash_host_label && !strcasecmp(key, HEALTH_HOST_LABEL_KEY)) {
alert_cfg->host_labels = strdupz(value);
- if(rt->labels) {
- if(strcmp(rt->labels, value) != 0)
+ if(rt->host_labels) {
+ if(strcmp(rt->host_labels, value) != 0)
error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
- line, filename, rt->name, key, rt->labels, value, value);
+ line, filename, rt->name, key, rt->host_labels, value, value);
- freez(rt->labels);
- simple_pattern_free(rt->splabels);
+ freez(rt->host_labels);
+ simple_pattern_free(rt->host_labels_pattern);
}
- rt->labels = simple_pattern_trim_around_equal(value);
- rt->splabels = simple_pattern_create(rt->labels, NULL, SIMPLE_PATTERN_EXACT);
+ rt->host_labels = simple_pattern_trim_around_equal(value);
+ rt->host_labels_pattern = simple_pattern_create(rt->host_labels, NULL, SIMPLE_PATTERN_EXACT);
}
else {
error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.",
diff --git a/health/health_json.c b/health/health_json.c
index d5285c11e..4e8f43761 100644
--- a/health/health_json.c
+++ b/health/health_json.c
@@ -29,6 +29,7 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
"\t\t\"config_hash_id\": \"%s\",\n"
"\t\t\"name\": \"%s\",\n"
"\t\t\"chart\": \"%s\",\n"
+ "\t\t\"context\": \"%s\",\n"
"\t\t\"family\": \"%s\",\n"
"\t\t\"class\": \"%s\",\n"
"\t\t\"component\": \"%s\",\n"
@@ -65,6 +66,7 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
, config_hash_id
, ae->name
, ae->chart
+ , ae->chart_context
, ae->family
, ae->classification?ae->classification:"Unknown"
, ae->component?ae->component:"Unknown"
diff --git a/health/health_log.c b/health/health_log.c
index 54f6dc9fc..f0a05531d 100644
--- a/health/health_log.c
+++ b/health/health_log.c
@@ -74,28 +74,15 @@ inline void health_label_log_save(RRDHOST *host) {
if(unlikely(host->health_log_fp)) {
BUFFER *wb = buffer_create(1024);
- rrdhost_check_rdlock(host);
- netdata_rwlock_rdlock(&host->labels.labels_rwlock);
- struct label *l=localhost->labels.head;
- while (l != NULL) {
- buffer_sprintf(wb,"%s=%s\t ", l->key, l->value);
- l = l->next;
- }
- netdata_rwlock_unlock(&host->labels.labels_rwlock);
-
- char *write = (char *) buffer_tostring(wb) ;
- write[wb->len-2] = '\n';
- write[wb->len-1] = '\0';
+ rrdlabels_to_buffer(localhost->host_labels, wb, "", "=", "", "\t ", NULL, NULL, NULL, NULL);
+ char *write = (char *) buffer_tostring(wb);
- if (unlikely(fprintf(host->health_log_fp, "L\t%s"
- , write
- ) < 0))
+ if (unlikely(fprintf(host->health_log_fp, "L\t%s", write) < 0))
error("HEALTH [%s]: failed to save alarm log entry to '%s'. Health data may be lost in case of abnormal restart.",
host->hostname, host->health_log_filename);
- else {
+ else
host->health_log_entries_written++;
- }
buffer_free(wb);
}
@@ -111,7 +98,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
"\t%08x\t%08x\t%08x"
"\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
"\t%d\t%d\t%d\t%d"
- "\t" CALCULATED_NUMBER_FORMAT_AUTO "\t" CALCULATED_NUMBER_FORMAT_AUTO
+ "\t" NETDATA_DOUBLE_FORMAT_AUTO "\t" NETDATA_DOUBLE_FORMAT_AUTO
"\t%016"PRIx64""
"\t%s\t%s\t%s"
"\n"
@@ -463,6 +450,7 @@ inline ALARM_ENTRY* health_create_alarm_entry(
time_t when,
const char *name,
const char *chart,
+ const char *chart_context,
const char *family,
const char *class,
const char *component,
@@ -470,8 +458,8 @@ inline ALARM_ENTRY* health_create_alarm_entry(
const char *exec,
const char *recipient,
time_t duration,
- calculated_number old_value,
- calculated_number new_value,
+ NETDATA_DOUBLE old_value,
+ NETDATA_DOUBLE new_value,
RRDCALC_STATUS old_status,
RRDCALC_STATUS new_status,
const char *source,
@@ -491,6 +479,9 @@ inline ALARM_ENTRY* health_create_alarm_entry(
ae->hash_chart = simple_hash(ae->chart);
}
+ if(chart_context)
+ ae->chart_context = strdupz(chart_context);
+
uuid_copy(ae->config_hash_id, *((uuid_t *) config_hash_id));
if(family)
@@ -596,6 +587,7 @@ inline void health_alarm_log(
inline void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae) {
freez(ae->name);
freez(ae->chart);
+ freez(ae->chart_context);
freez(ae->family);
freez(ae->classification);
freez(ae->component);
diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in
index 38a69a0f3..0dfecade5 100755
--- a/health/notifications/alarm-notify.sh.in
+++ b/health/notifications/alarm-notify.sh.in
@@ -2898,6 +2898,10 @@ if [ -n "$total_crit_alarms" ]; then
done <<<"$total_crit_alarms,"
fi
+if (( total_warnings + total_critical > 15 )); then
+ EXTRA_ALARMS_LIST_TEXT="(Showing latest 15 alerts)"
+fi
+
if [ -n "$edit_command_line" ]; then
IFS='=' read -r edit_command line s_host <<<"$edit_command_line"
fi
@@ -3423,6 +3427,10 @@ Content-Transfer-Encoding: 8bit
<span style="font-weight:600">${total_critical} critical</span>
additional active alert(s)</div>
</td>
+ </tr>
+ <td align="left" style="font-size:0px;padding:10px 25px;word-break:break-word;">
+ <div style="font-family:Open Sans, sans-serif;font-size:12px;line-height:1;text-align:center;color:#35414A;">${EXTRA_ALARMS_LIST_TEXT}</div>
+ </td>
</tr>
</tbody>
</table>
diff --git a/health/notifications/msteams/README.md b/health/notifications/msteams/README.md
index 14dbe7511..c9a13bac9 100644
--- a/health/notifications/msteams/README.md
+++ b/health/notifications/msteams/README.md
@@ -1,8 +1,6 @@
<!--
----
title: "Microsoft Teams"
custom_edit_url: https://github.com/netdata/netdata/edit/master/health/notifications/msteams/README.md
----
-->
# Microsoft Teams