Merging upstream version 1.36.0.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2022-08-12 07:26:17 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2022-08-12 07:26:17 +0000
commit: 7877a98bd9c00db5e81dd2f8c734cba2bab20be7 (patch)
tree: d18b767250f7c7ced9b8abe2ece784ac1fe24d3e /health/health.d
parent: Releasing debian version 1.35.1-2. (diff)
download: netdata-7877a98bd9c00db5e81dd2f8c734cba2bab20be7.tar.xz
netdata-7877a98bd9c00db5e81dd2f8c734cba2bab20be7.zip
7 files changed, 115 insertions, 222 deletions
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index aa416c79..4bfe38b6 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -69,3 +69,73 @@ component: Network
      info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \
            compared to the rate over the last minute
        to: sysadmin
+
+# ---------------------------------K8s containers--------------------------------------------
+
+ template: k8s_cgroup_10min_cpu_usage
+       on: k8s.cgroup.cpu_limit
+    class: Utilization
+     type: Cgroups
+component: CPU
+       os: linux
+    hosts: *
+   lookup: average -10m unaligned
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (75) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average cgroup CPU utilization over the last 10 minutes
+       to: sysadmin
+
+ template: k8s_cgroup_ram_in_use
+       on: k8s.cgroup.mem_usage
+    class: Utilization
+     type: Cgroups
+component: Memory
+       os: linux
+    hosts: *
+     calc: ($ram) * 100 / $memory_limit
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: cgroup memory utilization
+       to: sysadmin
+
+# check for packet storms
+
+# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
+# 2. do the same for the last 10s
+# 3. raise an alarm if the latter is 10x or 20x the first
+# we assume the minimum packet storm should at least have
+# 10000 packets/s, average of the last 10 seconds
+
+ template: k8s_cgroup_1m_received_packets_rate
+       on: k8s.cgroup.net_packets
+    class: Workload
+     type: Cgroups
+component: Network
+    hosts: *
+   lookup: average -1m unaligned of received
+    units: packets
+    every: 10s
+     info: average number of packets received by the network interface $family over the last minute
+
+ template: k8s_cgroup_10s_received_packets_storm
+       on: k8s.cgroup.net_packets
+    class: Workload
+     type: Cgroups
+component: Network
+    hosts: *
+   lookup: average -10s unaligned of received
+     calc: $this * 100 / (($k8s_cgroup_1m_received_packets_rate < 1000)?(1000):($k8s_cgroup_1m_received_packets_rate))
+    every: 10s
+    units: %
+     warn: $this > (($status >= $WARNING)?(200):(5000))
+     crit: $this > (($status == $CRITICAL)?(5000):(6000))
+  options: no-clear-notification
+     info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \
+           compared to the rate over the last minute
+       to: sysadmin
diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf
index 8bf84a97..a84ab342 100644
--- a/health/health.d/go.d.plugin.conf
+++ b/health/health.d/go.d.plugin.conf
@@ -6,7 +6,7 @@
     class: Error
      type: Netdata
 component: go.d.plugin
-   module: *
+   module: !* *
      calc: $now - $last_collected_t
     units: seconds ago
     every: 10s
diff --git a/health/health.d/ml.conf b/health/health.d/ml.conf
new file mode 100644
index 00000000..9bcc81e7
--- /dev/null
+++ b/health/health.d/ml.conf
@@ -0,0 +1,36 @@
+# below are some examples of using the `anomaly-bit` option to define alerts based on anomaly 
+# rates as opposed to raw metric values. You can read more about the anomaly-bit and Netdata's 
+# native anomaly detection here: 
+# https://learn.netdata.cloud/docs/configure/machine-learning#anomaly-bit---100--anomalous-0--normal
+
+# examples below are commented, you would need to uncomment and adjust as desired to enable them.
+
+# alert per dimension example
+# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error).
+# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error).
+# template: ml_5min_cpu_dims
+#       on: system.cpu
+#       os: linux
+#    hosts: *
+#   lookup: average -5m anomaly-bit foreach *
+#     calc: $this
+#    units: %
+#    every: 30s
+#     warn: $this > (($status >= $WARNING)  ? (5) : (20))
+#     crit: $this > (($status == $CRITICAL) ? (20) : (100))
+#     info: rolling 5min anomaly rate for each system.cpu dimension
+
+# alert per chart example
+# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error).
+# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error).
+# template: ml_5min_cpu_chart
+#       on: system.cpu
+#       os: linux
+#    hosts: *
+#   lookup: average -5m anomaly-bit of *
+#     calc: $this
+#    units: %
+#    every: 30s
+#     warn: $this > (($status >= $WARNING)  ? (5) : (20))
+#     crit: $this > (($status == $CRITICAL) ? (20) : (100))
+#     info: rolling 5min anomaly rate for system.cpu chart
+\ No newline at end of file
diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf
index f3abc588..e3b3d11c 100644
--- a/health/health.d/python.d.plugin.conf
+++ b/health/health.d/python.d.plugin.conf
@@ -6,7 +6,7 @@
     class: Error
      type: Netdata
 component: python.d.plugin
-   module: *
+   module: !* *
      calc: $now - $last_collected_t
     units: seconds ago
     every: 10s
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index ff5f3ac1..ab382c43 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -54,7 +54,7 @@ host labels: _is_k8s_node = false
 component: Memory
        os: freebsd
     hosts: *
-     calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+     calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers + $cache + $free + $inactive)
     units: %
     every: 10s
      warn: $this > (($status >= $WARNING)  ? (80) : (90))
@@ -64,13 +64,13 @@ component: Memory
        to: sysadmin
 
     alarm: ram_available
-       on: system.ram
+       on: mem.available
     class: Utilization
      type: System
 component: Memory
        os: freebsd
     hosts: *
-     calc: ($free + $inactive + $cache) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+     calc: $avail * 100 / ($system.ram.free + $system.ram.active + $system.ram.inactive + $system.ram.wired + $system.ram.cache + $system.ram.laundry + $system.ram.buffers)
     units: %
     every: 10s
      warn: $this < (($status >= $WARNING)  ? (15) : (10))
diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf
index dfb771e8..cad5230c 100644
--- a/health/health.d/redis.conf
+++ b/health/health.d/redis.conf
@@ -6,7 +6,7 @@
      type: KV Storage
 component: Redis
     every: 10s
-     crit: $rdb_last_bgsave_status != 0
+     crit: $last_bgsave != nan AND $last_bgsave != 0
     units: ok/failed
      info: status of the last RDB save operation (0: ok, 1: error)
     delay: down 5m multiplier 1.5 max 1h
@@ -19,8 +19,9 @@ component: Redis
      type: KV Storage
 component: Redis
     every: 10s
-     warn: $rdb_bgsave_in_progress > 600
-     crit: $rdb_bgsave_in_progress > 1200
+     calc: $current_bgsave_time
+     warn: $this > 600
+     crit: $this > 1200
     units: seconds
      info: duration of the on-going RDB save operation
     delay: down 5m multiplier 1.5 max 1h
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
index 454e0abe..c33c4664 100644
--- a/health/health.d/web_log.conf
+++ b/health/health.d/web_log.conf
@@ -1,218 +1,4 @@
 
-# -----------------------------------------------------------------------------
-# high level response code alarms
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-#  $1m_requests > 120
-#
-# i.e. when there are at least 120 requests during the last minute
-
- template: 1m_requests
-       on: web_log.response_statuses
-    class: Workload
-     type: Web Server
-component: Web log
- families: *
-   lookup: sum -1m unaligned
-     calc: ($this == 0)?(1):($this)
-    units: requests
-    every: 10s
-     info: number of HTTP requests in the last minute
-
- template: 1m_successful
-       on: web_log.response_statuses
-    class: Workload
-     type: Web Server
-component: Web log
- families: *
-   lookup: sum -1m unaligned of successful_requests
-     calc: $this * 100 / $1m_requests
-    units: %
-    every: 10s
-     warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
-     crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
-    delay: up 2m down 15m multiplier 1.5 max 1h
-     info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
-       to: webmaster
-
- template: 1m_redirects
-       on: web_log.response_statuses
-    class: Workload
-     type: Web Server
-component: Web log
- families: *
-   lookup: sum -1m unaligned of redirects
-     calc: $this * 100 / $1m_requests
-    units: %
-    every: 10s
-     warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? (  1 ) : ( 20 )) ) : ( 0 )
-    delay: up 2m down 15m multiplier 1.5 max 1h
-     info: ratio of redirection HTTP requests over the last minute (3xx except 304)
-       to: webmaster
-
- template: 1m_bad_requests
-       on: web_log.response_statuses
-    class: Errors
-     type: Web Server
-component: Web log
- families: *
-   lookup: sum -1m unaligned of bad_requests
-     calc: $this * 100 / $1m_requests
-    units: %
-    every: 10s
-     warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 10 ) : ( 30 )) ) : ( 0 )
-    delay: up 2m down 15m multiplier 1.5 max 1h
-     info: ratio of client error HTTP requests over the last minute (4xx except 401)
-       to: webmaster
-
- template: 1m_internal_errors
-       on: web_log.response_statuses
-    class: Errors
-     type: Web Server
-component: Web log
- families: *
-   lookup: sum -1m unaligned of server_errors
-     calc: $this * 100 / $1m_requests
-    units: %
-    every: 10s
-     warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 1 ) : ( 2 )) ) : ( 0 )
-     crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
-    delay: up 2m down 15m multiplier 1.5 max 1h
-     info: ratio of server error HTTP requests over the last minute (5xx)
-       to: webmaster
-
-# unmatched lines
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-#  $1m_total_requests > 120
-#
-# i.e. when there are at least 120 requests during the last minute
-
- template: 1m_total_requests
-       on: web_log.response_codes
-    class: Workload
-     type: Web Server
-component: Web log
- families: *
-   lookup: sum -1m unaligned
-     calc: ($this == 0)?(1):($this)
-    units: requests
-    every: 10s
-     info: number of HTTP requests over the last minute
-
- template: 1m_unmatched
-       on: web_log.response_codes
-    class: Errors
-     type: Web Server
-component: Web log
- families: *
-   lookup: sum -1m unaligned of unmatched
-     calc: $this * 100 / $1m_total_requests
-    units: %
-    every: 10s
-     warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 )
-    delay: up 1m down 5m multiplier 1.5 max 1h
-     info: percentage of unparsed log lines over the last minute
-       to: webmaster
-
-# -----------------------------------------------------------------------------
-# web slow
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-#  $1m_requests > 120
-#
-# i.e. when there are at least 120 requests during the last minute
-
- template: 10m_response_time
-       on: web_log.response_time
-    class: Latency
-     type: System
-component: Web log
- families: *
-   lookup: average -10m unaligned of avg
-    units: ms
-    every: 30s
-     info: average HTTP response time over the last 10 minutes
-
- template: web_slow
-       on: web_log.response_time
-    class: Latency
-     type: Web Server
-component: Web log
- families: *
-   lookup: average -1m unaligned of avg
-    units: ms
-    every: 10s
-    green: 500
-      red: 1000
-     warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
-     crit: ($1m_requests > 120) ? ($this > $red   && $this > ($10m_response_time * 4) ) : ( 0 )
-    delay: down 15m multiplier 1.5 max 1h
-     info: average HTTP response time over the last minute
-  options: no-clear-notification
-       to: webmaster
-
-# -----------------------------------------------------------------------------
-# web too many or too few requests
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-#  $5m_successful_old > 120
-#
-# i.e. when there were at least 120 requests during the 5 minutes starting
-#      at -10m and ending at -5m
-
- template: 5m_successful_old
-       on: web_log.response_statuses
-    class: Workload
-     type: Web Server
-component: Web log
- families: *
-   lookup: average -5m at -5m unaligned of successful_requests
-    units: requests/s
-    every: 30s
-     info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
-
- template: 5m_successful
-       on: web_log.response_statuses
-    class: Workload
-     type: Web Server
-component: Web log
- families: *
-   lookup: average -5m unaligned of successful_requests
-    units: requests/s
-    every: 30s
-     info: average number of successful HTTP requests over the last 5 minutes
-
- template: 5m_requests_ratio
-       on: web_log.response_codes
-    class: Workload
-     type: Web Server
-component: Web log
- families: *
-     calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100)
-    units: %
-    every: 30s
-     warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
-     crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
-    delay: down 15m multiplier 1.5 max 1h
-  options: no-clear-notification
-     info: ratio of successful HTTP requests over the last 5 minutes, \
-           compared with the previous 5 minutes \
-           (clear notification for this alarm will not be sent)
-       to: webmaster
-
-
-
-# ---------------------------------------------------GO-VERSION---------------------------------------------------------
-
 # unmatched lines
 
 # the following alarms trigger only when there are enough data.
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2022-08-12 07:26:17 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2022-08-12 07:26:17 +0000
commit	7877a98bd9c00db5e81dd2f8c734cba2bab20be7 (patch)
tree	d18b767250f7c7ced9b8abe2ece784ac1fe24d3e /health/health.d
parent	Releasing debian version 1.35.1-2. (diff)
download	netdata-7877a98bd9c00db5e81dd2f8c734cba2bab20be7.tar.xz netdata-7877a98bd9c00db5e81dd2f8c734cba2bab20be7.zip