Adding upstream version 1.36.0.upstream/1.36.0

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2022-08-12 07:26:11 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2022-08-12 07:26:11 +0000
commit: 3c315f0fff93aa072472abc10815963ac0035268 (patch)
tree: a95f6a96e0e7bd139c010f8dc60b40e5b3062a99 /health
parent: Adding upstream version 1.35.1. (diff)
download: netdata-upstream/1.36.0.tar.xz
netdata-upstream/1.36.0.zip
16 files changed, 301 insertions, 304 deletions
diff --git a/health/Makefile.am b/health/Makefile.am
index d5eb88468..777b35858 100644
--- a/health/Makefile.am
+++ b/health/Makefile.am
@@ -61,6 +61,7 @@ dist_healthconfig_DATA = \
     health.d/megacli.conf \
     health.d/memcached.conf \
     health.d/memory.conf \
+    health.d/ml.conf \
     health.d/mysql.conf \
     health.d/net.conf \
     health.d/netfilter.conf \
diff --git a/health/REFERENCE.md b/health/REFERENCE.md
index 3c1e53b2a..d1af74767 100644
--- a/health/REFERENCE.md
+++ b/health/REFERENCE.md
@@ -895,6 +895,68 @@ lookup: mean -10s of user
 
 Since [`z = (x - mean) / stddev`](https://en.wikipedia.org/wiki/Standard_score) we create two input alarms, one for `mean` and one for `stddev` and then use them both as inputs in our final `cpu_user_zscore` alarm.
 
+### Example 8 - [Anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) based CPU dimensions alarm
+
+Warning if 5 minute rolling [anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) for any CPU dimension is above 5%, critical if it goes above 20%:
+
+```yaml
+template: ml_5min_cpu_dims
+      on: system.cpu
+      os: linux
+   hosts: *
+  lookup: average -5m anomaly-bit foreach *
+    calc: $this
+   units: %
+   every: 30s
+    warn: $this > (($status >= $WARNING)  ? (5) : (20))
+    crit: $this > (($status == $CRITICAL) ? (20) : (100))
+    info: rolling 5min anomaly rate for each system.cpu dimension
+```
+
+The `lookup` line will calculate the average anomaly rate of each `system.cpu` dimension over the last 5 minutes. In this case
+Netdata will create alarms for all dimensions of the chart.
+
+### Example 9 - [Anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) based CPU chart alarm
+
+Warning if 5 minute rolling [anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) averaged across all CPU dimensions is above 5%, critical if it goes above 20%:
+
+```yaml
+template: ml_5min_cpu_chart
+      on: system.cpu
+      os: linux
+   hosts: *
+  lookup: average -5m anomaly-bit of *
+    calc: $this
+   units: %
+   every: 30s
+    warn: $this > (($status >= $WARNING)  ? (5) : (20))
+    crit: $this > (($status == $CRITICAL) ? (20) : (100))
+    info: rolling 5min anomaly rate for system.cpu chart
+```
+
+The `lookup` line will calculate the average anomaly rate across all `system.cpu` dimensions over the last 5 minutes. In this case
+Netdata will create one alarm for the chart.
+
+### Example 10 - [Anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) based node level alarm
+
+Warning if 5 minute rolling [anomaly rate](https://learn.netdata.cloud/docs/agent/ml#anomaly-rate) averaged across all ML enabled dimensions is above 5%, critical if it goes above 20%:
+
+```yaml
+template: ml_5min_node
+      on: anomaly_detection.anomaly_rate
+      os: linux
+   hosts: *
+  lookup: average -5m of anomaly_rate
+    calc: $this
+   units: %
+   every: 30s
+    warn: $this > (($status >= $WARNING)  ? (5) : (20))
+    crit: $this > (($status == $CRITICAL) ? (20) : (100))
+    info: rolling 5min anomaly rate for all ML enabled dims
+```
+
+The `lookup` line will use the `anomaly_rate` dimension of the `anomaly_detection.anomaly_rate` ML chart to calculate the average [node level anomaly rate](https://learn.netdata.cloud/docs/agent/ml#node-anomaly-rate) over the last 5 minutes.
+
 ## Troubleshooting
 
 You can compile Netdata with [debugging](/daemon/README.md#debugging) and then set in `netdata.conf`:
diff --git a/health/health.c b/health/health.c
index 3c1e5693e..9eb36a9c6 100644
--- a/health/health.c
+++ b/health/health.c
@@ -11,6 +11,12 @@ static struct {
     ALARM_ENTRY *tail; // latest
 } alarm_notifications_in_progress = {NULL, NULL};
 
+typedef struct active_alerts {   // one raised alert collected while building the notification's active-alarm lists
+    char *name;                  // assigned from rc->name (the pointer is stored; the string is not duplicated here)
+    time_t last_status_change;   // assigned from rc->last_status_change; sort key used by compare_active_alerts()
+    RRDCALC_STATUS status;       // assigned from rc->status; WARNING vs CRITICAL selects the output buffer
+} active_alerts_t;
+
 static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae)
 {
     ae->prev_in_progress = NULL;
@@ -219,10 +225,6 @@ static void health_reload_host(RRDHOST *host) {
  * Reload the host configuration for all hosts.
  */
 void health_reload(void) {
-#ifdef ENABLE_ACLK
-    if (netdata_cloud_setting)
-        aclk_single_update_disable();
-#endif
     sql_refresh_hashes();
 
     rrd_rdlock();
@@ -234,11 +236,7 @@ void health_reload(void) {
     rrd_unlock();
 #ifdef ENABLE_ACLK
     if (netdata_cloud_setting) {
-        aclk_single_update_enable();
-        aclk_alarm_reload();
-#ifdef ENABLE_NEW_CLOUD_PROTOCOL
         aclk_alert_reloaded = 1;
-#endif
     }
 #endif
 }
@@ -246,13 +244,22 @@ void health_reload(void) {
 // ----------------------------------------------------------------------------
 // health main thread and friends
 
-static inline RRDCALC_STATUS rrdcalc_value2status(calculated_number n) {
+static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) {
     if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
     if(n) return RRDCALC_STATUS_RAISED;
     return RRDCALC_STATUS_CLEAR;
 }
 
 #define ALARM_EXEC_COMMAND_LENGTH 8192
+#define ACTIVE_ALARMS_LIST_EXAMINE 500
+#define ACTIVE_ALARMS_LIST 15
+
+// qsort() comparator for active_alerts_t: newest last_status_change first; compares instead of subtracting so a time_t difference is never narrowed to int.
+static inline int compare_active_alerts(const void * a, const void * b) {
+    const active_alerts_t *active_alerts_a = a;
+    const active_alerts_t *active_alerts_b = b;
+    return (active_alerts_b->last_status_change > active_alerts_a->last_status_change) - (active_alerts_b->last_status_change < active_alerts_a->last_status_change);
+}
 
 static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
     ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
@@ -318,31 +325,28 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
     RRDCALC *rc;
     EVAL_EXPRESSION *expr=NULL;
     BUFFER *warn_alarms, *crit_alarms;
+    active_alerts_t *active_alerts = callocz(ACTIVE_ALARMS_LIST_EXAMINE, sizeof(active_alerts_t));
 
     warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
     crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
 
-    for(rc = host->alarms; rc ; rc = rc->next) {
+    for(rc = host->alarms; rc && (n_warn + n_crit) < ACTIVE_ALARMS_LIST_EXAMINE ; rc = rc->next) {
         if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
             continue;
 
         if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
             if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
-                if (n_warn)
-                    buffer_strcat(warn_alarms, ",");
-                buffer_strcat(warn_alarms, rc->name);
-                buffer_strcat(warn_alarms, "=");
-                buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change);
+                active_alerts[n_warn+n_crit].name = rc->name;
+                active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
+                active_alerts[n_warn+n_crit].status = rc->status;
                 n_warn++;
             } else if (ae->alarm_id == rc->id)
                 expr = rc->warning;
         } else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
             if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
-                if (n_crit)
-                    buffer_strcat(crit_alarms, ",");
-                buffer_strcat(crit_alarms, rc->name);
-                buffer_strcat(crit_alarms, "=");
-                buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change);
+                active_alerts[n_warn+n_crit].name = rc->name;
+                active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
+                active_alerts[n_warn+n_crit].status = rc->status;
                 n_crit++;
             } else if (ae->alarm_id == rc->id)
                 expr = rc->critical;
@@ -352,9 +356,34 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
         }
     }
 
+    if (n_warn+n_crit>1)
+        qsort (active_alerts, n_warn+n_crit, sizeof(active_alerts_t), compare_active_alerts);
+
+    int count_w = 0, count_c = 0;
+    while (count_w + count_c < n_warn + n_crit && count_w + count_c < ACTIVE_ALARMS_LIST) {
+        if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_WARNING) {
+            if (count_w)
+                buffer_strcat(warn_alarms, ",");
+            buffer_strcat(warn_alarms, active_alerts[count_w+count_c].name);
+            buffer_strcat(warn_alarms, "=");
+            buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
+            count_w++;
+        }
+        else if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_CRITICAL) {
+            if (count_c)
+                buffer_strcat(crit_alarms, ",");
+            buffer_strcat(crit_alarms, active_alerts[count_w+count_c].name);
+            buffer_strcat(crit_alarms, "=");
+            buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
+            count_c++;
+        }
+    }
+
     char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN");
 
-    snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
+    snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" NETDATA_DOUBLE_FORMAT_ZERO
+        "' '" NETDATA_DOUBLE_FORMAT_ZERO
+        "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
               exec,
               recipient,
               host->registry_hostname,
@@ -398,6 +427,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
     freez(edit_command);
     buffer_free(warn_alarms);
     buffer_free(crit_alarms);
+    freez(active_alerts);
 
     return; //health_alarm_wait_for_execution
 done:
@@ -419,7 +449,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
 }
 
 static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
-    debug(D_HEALTH, "Health alarm '%s.%s' = " CALCULATED_NUMBER_FORMAT_AUTO " - changed status from %s to %s",
+    debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
          ae->chart?ae->chart:"NOCHART", ae->name,
          ae->new_value,
          rrdcalc_status2string(ae->old_status),
@@ -736,7 +766,7 @@ void *health_main(void *ptr) {
     rrdcalc_labels_unlink();
 
     unsigned int loop = 0;
-#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
+#ifdef ENABLE_ACLK
     unsigned int marked_aclk_reload_loop = 0;
 #endif
     while(!netdata_exit) {
@@ -765,7 +795,7 @@ void *health_main(void *ptr) {
             }
         }
 
-#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
+#ifdef ENABLE_ACLK
         if (aclk_alert_reloaded && !marked_aclk_reload_loop)
             marked_aclk_reload_loop = loop;
 #endif
@@ -795,6 +825,11 @@ void *health_main(void *ptr) {
                 host->health_delay_up_to = 0;
             }
 
+            // wait until cleanup of obsolete charts on children is complete
+            if (host != localhost)
+                if (unlikely(host->trigger_chart_obsoletion_check == 1))
+                    continue;
+
             if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
                 sql_health_alarm_log_cleanup(host);
 
@@ -818,7 +853,7 @@ void *health_main(void *ptr) {
                         worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
                         time_t now = now_realtime_sec();
                         ALARM_ENTRY *ae = health_create_alarm_entry(
-                            host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
+                            host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
                             rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
                             rc->value, NAN, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0, 0);
                         if (ae) {
@@ -828,7 +863,7 @@ void *health_main(void *ptr) {
                             rc->last_status_change = now;
                             rc->last_updated = now;
                             rc->value = NAN;
-#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
+#ifdef ENABLE_ACLK
                             if (netdata_cloud_setting && likely(!aclk_alert_reloaded))
                                 sql_queue_alarm_to_aclk(host, ae, 1);
 #endif
@@ -855,10 +890,12 @@ void *health_main(void *ptr) {
                     /* time_t old_db_timestamp = rc->db_before; */
                     int value_is_null = 0;
 
-                    int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1, rc->after,
-                                                  rc->before, rc->group, 0, rc->options, &rc->db_after,
-                                                  &rc->db_before, &value_is_null, 0
-                    );
+                    int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1,
+                                                  rc->after, rc->before, rc->group, NULL,
+                                                  0, rc->options,
+                                                  &rc->db_after,&rc->db_before,
+                                                  NULL, NULL, NULL,
+                                                  &value_is_null, NULL, 0, 0);
 
                     if (unlikely(ret != 200)) {
                         // database lookup failed
@@ -898,8 +935,7 @@ void *health_main(void *ptr) {
                     } else
                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
 
-                    debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value "
-                          CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
+                    debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
                           rc->value
                     );
                 }
@@ -923,7 +959,7 @@ void *health_main(void *ptr) {
                         rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
 
                         debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
-                              CALCULATED_NUMBER_FORMAT
+                              NETDATA_DOUBLE_FORMAT
                               ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
                               rc->calculation->parsed_as, rc->calculation->result,
                               buffer_tostring(rc->calculation->error_msg), rc->source
@@ -972,7 +1008,7 @@ void *health_main(void *ptr) {
                         } else {
                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
-                                  CALCULATED_NUMBER_FORMAT
+                                  NETDATA_DOUBLE_FORMAT
                                   ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
                                   rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source
                             );
@@ -998,7 +1034,7 @@ void *health_main(void *ptr) {
                         } else {
                             rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
                             debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
-                                  CALCULATED_NUMBER_FORMAT
+                                  NETDATA_DOUBLE_FORMAT
                                   ": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
                                   rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg),
                                   rc->source
@@ -1077,7 +1113,7 @@ void *health_main(void *ptr) {
 
 
                         ALARM_ENTRY *ae = health_create_alarm_entry(
-                                host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
+                                host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
                                 rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
                                 rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
                                 rc->delay_last,
@@ -1129,7 +1165,7 @@ void *health_main(void *ptr) {
                         rc->last_repeat = now;
                         if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
                         ALARM_ENTRY *ae = health_create_alarm_entry(
-                                host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
+                                host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
                                 rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
                                 rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
                                 rc->delay_last,
@@ -1178,7 +1214,7 @@ void *health_main(void *ptr) {
             health_alarm_wait_for_execution(ae);
         }
 
-#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
+#ifdef ENABLE_ACLK
         if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
                 rrdhost_foreach_read(host) {
                     if (unlikely(!host->health_enabled))
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index aa416c795..4bfe38b65 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -69,3 +69,73 @@ component: Network
      info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \
            compared to the rate over the last minute
        to: sysadmin
+
+# ---------------------------------K8s containers--------------------------------------------
+
+ template: k8s_cgroup_10min_cpu_usage
+       on: k8s.cgroup.cpu_limit
+    class: Utilization
+     type: Cgroups
+component: CPU
+       os: linux
+    hosts: *
+   lookup: average -10m unaligned
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (75) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average cgroup CPU utilization over the last 10 minutes
+       to: sysadmin
+
+ template: k8s_cgroup_ram_in_use
+       on: k8s.cgroup.mem_usage
+    class: Utilization
+     type: Cgroups
+component: Memory
+       os: linux
+    hosts: *
+     calc: ($ram) * 100 / $memory_limit
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: cgroup memory utilization
+       to: sysadmin
+
+# check for packet storms
+
+# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
+# 2. do the same for the last 10s
+# 3. raise an alarm if the latter is 10x or 20x the first
+# we assume the minimum packet storm should at least have
+# 10000 packets/s, average of the last 10 seconds
+
+ template: k8s_cgroup_1m_received_packets_rate
+       on: k8s.cgroup.net_packets
+    class: Workload
+     type: Cgroups
+component: Network
+    hosts: *
+   lookup: average -1m unaligned of received
+    units: packets
+    every: 10s
+     info: average number of packets received by the network interface $family over the last minute
+
+ template: k8s_cgroup_10s_received_packets_storm
+       on: k8s.cgroup.net_packets
+    class: Workload
+     type: Cgroups
+component: Network
+    hosts: *
+   lookup: average -10s unaligned of received
+     calc: $this * 100 / (($k8s_cgroup_1m_received_packets_rate < 1000)?(1000):($k8s_cgroup_1m_received_packets_rate))
+    every: 10s
+    units: %
+     warn: $this > (($status >= $WARNING)?(200):(5000))
+     crit: $this > (($status == $CRITICAL)?(5000):(6000))
+  options: no-clear-notification
+     info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \
+           compared to the rate over the last minute
+       to: sysadmin
diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf
index 8bf84a976..a84ab342f 100644
--- a/health/health.d/go.d.plugin.conf
+++ b/health/health.d/go.d.plugin.conf
@@ -6,7 +6,7 @@
     class: Error
      type: Netdata
 component: go.d.plugin
-   module: *
+   module: !* *
      calc: $now - $last_collected_t
     units: seconds ago
     every: 10s
diff --git a/health/health.d/ml.conf b/health/health.d/ml.conf
new file mode 100644
index 000000000..9bcc81e76
--- /dev/null
+++ b/health/health.d/ml.conf
@@ -0,0 +1,36 @@
+# below are some examples of using the `anomaly-bit` option to define alerts based on anomaly 
+# rates as opposed to raw metric values. You can read more about the anomaly-bit and Netdata's 
+# native anomaly detection here: 
+# https://learn.netdata.cloud/docs/configure/machine-learning#anomaly-bit---100--anomalous-0--normal
+
+# examples below are commented, you would need to uncomment and adjust as desired to enable them.
+
+# alert per dimension example
+# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error).
+# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error).
+# template: ml_5min_cpu_dims
+#       on: system.cpu
+#       os: linux
+#    hosts: *
+#   lookup: average -5m anomaly-bit foreach *
+#     calc: $this
+#    units: %
+#    every: 30s
+#     warn: $this > (($status >= $WARNING)  ? (5) : (20))
+#     crit: $this > (($status == $CRITICAL) ? (20) : (100))
+#     info: rolling 5min anomaly rate for each system.cpu dimension
+
+# alert per chart example
+# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error).
+# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error).
+# template: ml_5min_cpu_chart
+#       on: system.cpu
+#       os: linux
+#    hosts: *
+#   lookup: average -5m anomaly-bit of *
+#     calc: $this
+#    units: %
+#    every: 30s
+#     warn: $this > (($status >= $WARNING)  ? (5) : (20))
+#     crit: $this > (($status == $CRITICAL) ? (20) : (100))
+#     info: rolling 5min anomaly rate for system.cpu chart
+\ No newline at end of file
diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf
index f3abc588f..e3b3d11cf 100644
--- a/health/health.d/python.d.plugin.conf
+++ b/health/health.d/python.d.plugin.conf
@@ -6,7 +6,7 @@
     class: Error
      type: Netdata
 component: python.d.plugin
-   module: *
+   module: !* *
      calc: $now - $last_collected_t
     units: seconds ago
     every: 10s
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index ff5f3ac17..ab382c43b 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -54,7 +54,7 @@ host labels: _is_k8s_node = false
 component: Memory
        os: freebsd
     hosts: *
-     calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+     calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers + $cache + $free + $inactive)
     units: %
     every: 10s
      warn: $this > (($status >= $WARNING)  ? (80) : (90))
@@ -64,13 +64,13 @@ component: Memory
        to: sysadmin
 
     alarm: ram_available
-       on: system.ram
+       on: mem.available
     class: Utilization
      type: System
 component: Memory
        os: freebsd
     hosts: *
-     calc: ($free + $inactive + $cache) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+     calc: $avail * 100 / ($system.ram.free + $system.ram.active + $system.ram.inactive + $system.ram.wired + $system.ram.cache + $system.ram.laundry + $system.ram.buffers)
     units: %
     every: 10s
      warn: $this < (($status >= $WARNING)  ? (15) : (10))
diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf
index dfb771e8c..cad5230c5 100644
--- a/health/health.d/redis.conf
+++ b/health/health.d/redis.conf
@@ -6,7 +6,7 @@
      type: KV Storage
 component: Redis
     every: 10s
-     crit: $rdb_last_bgsave_status != 0
+     crit: $last_bgsave != nan AND $last_bgsave != 0
     units: ok/failed
      info: status of the last RDB save operation (0: ok, 1: error)
     delay: down 5m multiplier 1.5 max 1h
@@ -19,8 +19,9 @@ component: Redis
      type: KV Storage
 component: Redis
     every: 10s
-     warn: $rdb_bgsave_in_progress > 600
-     crit: $rdb_bgsave_in_progress > 1200
+     calc: $current_bgsave_time
+     warn: $this > 600
+     crit: $this > 1200
     units: seconds
      info: duration of the on-going RDB save operation
     delay: down 5m multiplier 1.5 max 1h
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
index 454e0abef..c33c4664c 100644
--- a/health/health.d/web_log.conf
+++ b/health/health.d/web_log.conf
@@ -1,218 +1,4 @@
 
-# -----------------------------------------------------------------------------
-# high level response code alarms
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-#  $1m_requests > 120
-#
-# i.e. when there are at least 120 requests during the last minute
-
- template: 1m_requests
-       on: web_log.response_statuses
-    class: Workload
-     type: Web Server
-component: Web log
- families: *
-   lookup: sum -1m unaligned
-     calc: ($this == 0)?(1):($this)
-    units: requests
-    every: 10s
-     info: number of HTTP requests in the last minute
-
- template: 1m_successful
-       on: web_log.response_statuses
-    class: Workload
-     type: Web Server
-component: Web log
- families: *
-   lookup: sum -1m unaligned of successful_requests
-     calc: $this * 100 / $1m_requests
-    units: %
-    every: 10s
-     warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
-     crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
-    delay: up 2m down 15m multiplier 1.5 max 1h
-     info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
-       to: webmaster
-
- template: 1m_redirects
-       on: web_log.response_statuses
-    class: Workload
-     type: Web Server
-component: Web log
- families: *
-   lookup: sum -1m unaligned of redirects
-     calc: $this * 100 / $1m_requests
-    units: %
-    every: 10s
-     warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? (  1 ) : ( 20 )) ) : ( 0 )
-    delay: up 2m down 15m multiplier 1.5 max 1h
-     info: ratio of redirection HTTP requests over the last minute (3xx except 304)
-       to: webmaster
-
- template: 1m_bad_requests
-       on: web_log.response_statuses
-    class: Errors
-     type: Web Server
-component: Web log
- families: *
-   lookup: sum -1m unaligned of bad_requests
-     calc: $this * 100 / $1m_requests
-    units: %
-    every: 10s
-     warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 10 ) : ( 30 )) ) : ( 0 )
-    delay: up 2m down 15m multiplier 1.5 max 1h
-     info: ratio of client error HTTP requests over the last minute (4xx except 401)
-       to: webmaster
-
- template: 1m_internal_errors
-       on: web_log.response_statuses
-    class: Errors
-     type: Web Server
-component: Web log
- families: *
-   lookup: sum -1m unaligned of server_errors
-     calc: $this * 100 / $1m_requests
-    units: %
-    every: 10s
-     warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 1 ) : ( 2 )) ) : ( 0 )
-     crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
-    delay: up 2m down 15m multiplier 1.5 max 1h
-     info: ratio of server error HTTP requests over the last minute (5xx)
-       to: webmaster
-
-# unmatched lines
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-#  $1m_total_requests > 120
-#
-# i.e. when there are at least 120 requests during the last minute
-
- template: 1m_total_requests
-       on: web_log.response_codes
-    class: Workload
-     type: Web Server
-component: Web log
- families: *
-   lookup: sum -1m unaligned
-     calc: ($this == 0)?(1):($this)
-    units: requests
-    every: 10s
-     info: number of HTTP requests over the last minute
-
- template: 1m_unmatched
-       on: web_log.response_codes
-    class: Errors
-     type: Web Server
-component: Web log
- families: *
-   lookup: sum -1m unaligned of unmatched
-     calc: $this * 100 / $1m_total_requests
-    units: %
-    every: 10s
-     warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 )
-    delay: up 1m down 5m multiplier 1.5 max 1h
-     info: percentage of unparsed log lines over the last minute
-       to: webmaster
-
-# -----------------------------------------------------------------------------
-# web slow
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-#  $1m_requests > 120
-#
-# i.e. when there are at least 120 requests during the last minute
-
- template: 10m_response_time
-       on: web_log.response_time
-    class: Latency
-     type: System
-component: Web log
- families: *
-   lookup: average -10m unaligned of avg
-    units: ms
-    every: 30s
-     info: average HTTP response time over the last 10 minutes
-
- template: web_slow
-       on: web_log.response_time
-    class: Latency
-     type: Web Server
-component: Web log
- families: *
-   lookup: average -1m unaligned of avg
-    units: ms
-    every: 10s
-    green: 500
-      red: 1000
-     warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
-     crit: ($1m_requests > 120) ? ($this > $red   && $this > ($10m_response_time * 4) ) : ( 0 )
-    delay: down 15m multiplier 1.5 max 1h
-     info: average HTTP response time over the last minute
-  options: no-clear-notification
-       to: webmaster
-
-# -----------------------------------------------------------------------------
-# web too many or too few requests
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-#  $5m_successful_old > 120
-#
-# i.e. when there were at least 120 requests during the 5 minutes starting
-#      at -10m and ending at -5m
-
- template: 5m_successful_old
-       on: web_log.response_statuses
-    class: Workload
-     type: Web Server
-component: Web log
- families: *
-   lookup: average -5m at -5m unaligned of successful_requests
-    units: requests/s
-    every: 30s
-     info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
-
- template: 5m_successful
-       on: web_log.response_statuses
-    class: Workload
-     type: Web Server
-component: Web log
- families: *
-   lookup: average -5m unaligned of successful_requests
-    units: requests/s
-    every: 30s
-     info: average number of successful HTTP requests over the last 5 minutes
-
- template: 5m_requests_ratio
-       on: web_log.response_codes
-    class: Workload
-     type: Web Server
-component: Web log
- families: *
-     calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100)
-    units: %
-    every: 30s
-     warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
-     crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
-    delay: down 15m multiplier 1.5 max 1h
-  options: no-clear-notification
-     info: ratio of successful HTTP requests over the last 5 minutes, \
-           compared with the previous 5 minutes \
-           (clear notification for this alarm will not be sent)
-       to: webmaster
-
-
-
-# ---------------------------------------------------GO-VERSION---------------------------------------------------------
-
 # unmatched lines
 
 # the following alarms trigger only when there are enough data.
diff --git a/health/health.h b/health/health.h
index f25ae6bc6..3e77c12a7 100644
--- a/health/health.h
+++ b/health/health.h
@@ -35,7 +35,7 @@ extern void health_init(void);
 
 extern void health_reload(void);
 
-extern int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result);
+extern int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, NETDATA_DOUBLE *result);
 extern void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status);
 extern void health_alarms2json(RRDHOST *host, BUFFER *wb, int all);
 extern void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all);
@@ -56,6 +56,7 @@ extern ALARM_ENTRY* health_create_alarm_entry(
         time_t when,
         const char *name,
         const char *chart,
+        const char *chart_context,
         const char *family,
         const char *classification,
         const char *component,
@@ -63,8 +64,8 @@ extern ALARM_ENTRY* health_create_alarm_entry(
         const char *exec,
         const char *recipient,
         time_t duration,
-        calculated_number old_value,
-        calculated_number new_value,
+        NETDATA_DOUBLE old_value,
+        NETDATA_DOUBLE new_value,
         RRDCALC_STATUS old_status,
         RRDCALC_STATUS new_status,
         const char *source,
diff --git a/health/health_config.c b/health/health_config.c
index df6d7b609..e1dd32ab1 100644
--- a/health/health_config.c
+++ b/health/health_config.c
@@ -54,7 +54,9 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
 
     rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id);
 
-    debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
+    debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " NETDATA_DOUBLE_FORMAT_AUTO
+        ", red " NETDATA_DOUBLE_FORMAT_AUTO
+        ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
             rc->chart?rc->chart:"NOCHART",
             rc->name,
             rc->id,
@@ -141,7 +143,9 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL
         }
     }
 
-    debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
+    debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " NETDATA_DOUBLE_FORMAT_AUTO
+        ", red " NETDATA_DOUBLE_FORMAT_AUTO
+        ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', for each dimension '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
           rt->name,
           (rt->context)?rt->context:"NONE",
           (rt->exec)?rt->exec:"DEFAULT",
@@ -848,7 +852,7 @@ static int health_readfile(const char *filename, void *data) {
             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
                 alert_cfg->green = strdupz(value);
                 char *e;
-                rc->green = str2ld(value, &e);
+                rc->green = str2ndd(value, &e);
                 if(e && *e) {
                     error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
                             line, filename, rc->name, key, e);
@@ -857,7 +861,7 @@ static int health_readfile(const char *filename, void *data) {
             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
                 alert_cfg->red = strdupz(value);
                 char *e;
-                rc->red = str2ld(value, &e);
+                rc->red = str2ndd(value, &e);
                 if(e && *e) {
                     error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' leaves this string unmatched: '%s'.",
                             line, filename, rc->name, key, e);
@@ -955,17 +959,17 @@ static int health_readfile(const char *filename, void *data) {
             }
             else if(hash == hash_host_label && !strcasecmp(key, HEALTH_HOST_LABEL_KEY)) {
                 alert_cfg->host_labels = strdupz(value);
-                if(rc->labels) {
-                    if(strcmp(rc->labels, value) != 0)
+                if(rc->host_labels) {
+                    if(strcmp(rc->host_labels, value) != 0)
                         error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'.",
                               line, filename, rc->name, key, value, value);
 
-                    freez(rc->labels);
-                    simple_pattern_free(rc->splabels);
+                    freez(rc->host_labels);
+                    simple_pattern_free(rc->host_labels_pattern);
                 }
 
-                rc->labels = simple_pattern_trim_around_equal(value);
-                rc->splabels = simple_pattern_create(rc->labels, NULL, SIMPLE_PATTERN_EXACT);
+                rc->host_labels = simple_pattern_trim_around_equal(value);
+                rc->host_labels_pattern = simple_pattern_create(rc->host_labels, NULL, SIMPLE_PATTERN_EXACT);
             }
             else if(hash == hash_plugin && !strcasecmp(key, HEALTH_PLUGIN_KEY)) {
                 alert_cfg->plugin = strdupz(value);
@@ -1097,7 +1101,7 @@ static int health_readfile(const char *filename, void *data) {
             else if(hash == hash_green && !strcasecmp(key, HEALTH_GREEN_KEY)) {
                 alert_cfg->green = strdupz(value);
                 char *e;
-                rt->green = str2ld(value, &e);
+                rt->green = str2ndd(value, &e);
                 if(e && *e) {
                     error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
                             line, filename, rt->name, key, e);
@@ -1106,7 +1110,7 @@ static int health_readfile(const char *filename, void *data) {
             else if(hash == hash_red && !strcasecmp(key, HEALTH_RED_KEY)) {
                 alert_cfg->red = strdupz(value);
                 char *e;
-                rt->red = str2ld(value, &e);
+                rt->red = str2ndd(value, &e);
                 if(e && *e) {
                     error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' leaves this string unmatched: '%s'.",
                             line, filename, rt->name, key, e);
@@ -1204,17 +1208,17 @@ static int health_readfile(const char *filename, void *data) {
             }
             else if(hash == hash_host_label && !strcasecmp(key, HEALTH_HOST_LABEL_KEY)) {
                 alert_cfg->host_labels = strdupz(value);
-                if(rt->labels) {
-                    if(strcmp(rt->labels, value) != 0)
+                if(rt->host_labels) {
+                    if(strcmp(rt->host_labels, value) != 0)
                         error("Health configuration at line %zu of file '%s' for template '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
-                              line, filename, rt->name, key, rt->labels, value, value);
+                              line, filename, rt->name, key, rt->host_labels, value, value);
 
-                    freez(rt->labels);
-                    simple_pattern_free(rt->splabels);
+                    freez(rt->host_labels);
+                    simple_pattern_free(rt->host_labels_pattern);
                 }
 
-                rt->labels = simple_pattern_trim_around_equal(value);
-                rt->splabels = simple_pattern_create(rt->labels, NULL, SIMPLE_PATTERN_EXACT);
+                rt->host_labels = simple_pattern_trim_around_equal(value);
+                rt->host_labels_pattern = simple_pattern_create(rt->host_labels, NULL, SIMPLE_PATTERN_EXACT);
             }
             else {
                 error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.",
diff --git a/health/health_json.c b/health/health_json.c
index d5285c11e..4e8f43761 100644
--- a/health/health_json.c
+++ b/health/health_json.c
@@ -29,6 +29,7 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
                     "\t\t\"config_hash_id\": \"%s\",\n"
                     "\t\t\"name\": \"%s\",\n"
                     "\t\t\"chart\": \"%s\",\n"
+                    "\t\t\"context\": \"%s\",\n"
                     "\t\t\"family\": \"%s\",\n"
                     "\t\t\"class\": \"%s\",\n"
                     "\t\t\"component\": \"%s\",\n"
@@ -65,6 +66,7 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
                    , config_hash_id
                    , ae->name
                    , ae->chart
+                   , ae->chart_context
                    , ae->family
                    , ae->classification?ae->classification:"Unknown"
                    , ae->component?ae->component:"Unknown"
diff --git a/health/health_log.c b/health/health_log.c
index 54f6dc9fc..f0a05531d 100644
--- a/health/health_log.c
+++ b/health/health_log.c
@@ -74,28 +74,15 @@ inline void health_label_log_save(RRDHOST *host) {
 
     if(unlikely(host->health_log_fp)) {
         BUFFER *wb = buffer_create(1024);
-        rrdhost_check_rdlock(host);
-        netdata_rwlock_rdlock(&host->labels.labels_rwlock);
-        struct label *l=localhost->labels.head;
-        while (l != NULL) {
-            buffer_sprintf(wb,"%s=%s\t ", l->key, l->value);
-            l = l->next;
-        }
-        netdata_rwlock_unlock(&host->labels.labels_rwlock);
-
-        char *write = (char *) buffer_tostring(wb) ;
 
-        write[wb->len-2] = '\n';
-        write[wb->len-1] = '\0';
+        rrdlabels_to_buffer(localhost->host_labels, wb, "", "=", "", "\t ", NULL, NULL, NULL, NULL);
+        char *write = (char *) buffer_tostring(wb);
 
-        if (unlikely(fprintf(host->health_log_fp, "L\t%s"
-                , write
-        ) < 0))
+        if (unlikely(fprintf(host->health_log_fp, "L\t%s", write) < 0))
             error("HEALTH [%s]: failed to save alarm log entry to '%s'. Health data may be lost in case of abnormal restart.",
                   host->hostname, host->health_log_filename);
-        else {
+        else
             host->health_log_entries_written++;
-        }
 
         buffer_free(wb);
     }
@@ -111,7 +98,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
                         "\t%08x\t%08x\t%08x"
                         "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
                         "\t%d\t%d\t%d\t%d"
-                        "\t" CALCULATED_NUMBER_FORMAT_AUTO "\t" CALCULATED_NUMBER_FORMAT_AUTO
+                        "\t" NETDATA_DOUBLE_FORMAT_AUTO "\t" NETDATA_DOUBLE_FORMAT_AUTO
                         "\t%016"PRIx64""
                         "\t%s\t%s\t%s"
                         "\n"
@@ -463,6 +450,7 @@ inline ALARM_ENTRY* health_create_alarm_entry(
         time_t when,
         const char *name,
         const char *chart,
+        const char *chart_context,
         const char *family,
         const char *class,
         const char *component,
@@ -470,8 +458,8 @@ inline ALARM_ENTRY* health_create_alarm_entry(
         const char *exec,
         const char *recipient,
         time_t duration,
-        calculated_number old_value,
-        calculated_number new_value,
+        NETDATA_DOUBLE old_value,
+        NETDATA_DOUBLE new_value,
         RRDCALC_STATUS old_status,
         RRDCALC_STATUS new_status,
         const char *source,
@@ -491,6 +479,9 @@ inline ALARM_ENTRY* health_create_alarm_entry(
         ae->hash_chart = simple_hash(ae->chart);
     }
 
+    if(chart_context)
+        ae->chart_context = strdupz(chart_context);
+
     uuid_copy(ae->config_hash_id, *((uuid_t *) config_hash_id));
 
     if(family)
@@ -596,6 +587,7 @@ inline void health_alarm_log(
 inline void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae) {
     freez(ae->name);
     freez(ae->chart);
+    freez(ae->chart_context);
     freez(ae->family);
     freez(ae->classification);
     freez(ae->component);
diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in
index 38a69a0f3..0dfecade5 100755
--- a/health/notifications/alarm-notify.sh.in
+++ b/health/notifications/alarm-notify.sh.in
@@ -2898,6 +2898,10 @@ if [ -n "$total_crit_alarms" ]; then
    done <<<"$total_crit_alarms,"
 fi
 
+if (( total_warnings + total_critical > 15 )); then
+    EXTRA_ALARMS_LIST_TEXT="(Showing latest 15 alerts)"
+fi
+
 if [ -n "$edit_command_line" ]; then
     IFS='=' read -r edit_command line s_host <<<"$edit_command_line"
 fi
@@ -3423,6 +3427,10 @@ Content-Transfer-Encoding: 8bit
                             <span style="font-weight:600">${total_critical} critical</span>
                             additional active alert(s)</div>
                         </td>
+                        </tr>
+                        <td align="left" style="font-size:0px;padding:10px 25px;word-break:break-word;">
+                          <div style="font-family:Open Sans, sans-serif;font-size:12px;line-height:1;text-align:center;color:#35414A;">${EXTRA_ALARMS_LIST_TEXT}</div>
+                        </td>
                       </tr>
                       </tbody>
                     </table>
diff --git a/health/notifications/msteams/README.md b/health/notifications/msteams/README.md
index 14dbe7511..c9a13bac9 100644
--- a/health/notifications/msteams/README.md
+++ b/health/notifications/msteams/README.md
@@ -1,8 +1,6 @@
 <!--
----
 title: "Microsoft Teams"
 custom_edit_url: https://github.com/netdata/netdata/edit/master/health/notifications/msteams/README.md
----
 -->
 
 # Microsoft Teams
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2022-08-12 07:26:11 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2022-08-12 07:26:11 +0000
commit	3c315f0fff93aa072472abc10815963ac0035268 (patch)
tree	a95f6a96e0e7bd139c010f8dc60b40e5b3062a99 /health
parent	Adding upstream version 1.35.1. (diff)
download	netdata-upstream/1.36.0.tar.xz netdata-upstream/1.36.0.zip