summaryrefslogtreecommitdiffstats
path: root/health/health.d
diff options
context:
space:
mode:
Diffstat (limited to 'health/health.d')
-rw-r--r--health/health.d/cgroups.conf70
-rw-r--r--health/health.d/go.d.plugin.conf2
-rw-r--r--health/health.d/ml.conf36
-rw-r--r--health/health.d/python.d.plugin.conf2
-rw-r--r--health/health.d/ram.conf6
-rw-r--r--health/health.d/redis.conf7
-rw-r--r--health/health.d/web_log.conf214
7 files changed, 115 insertions, 222 deletions
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index aa416c795..4bfe38b65 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -69,3 +69,73 @@ component: Network
info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \
compared to the rate over the last minute
to: sysadmin
+
+# ---------------------------------K8s containers--------------------------------------------
+
+ template: k8s_cgroup_10min_cpu_usage
+ on: k8s.cgroup.cpu_limit
+ class: Utilization
+ type: Cgroups
+component: CPU
+ os: linux
+ hosts: *
+ lookup: average -10m unaligned
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average cgroup CPU utilization over the last 10 minutes
+ to: sysadmin
+
+ template: k8s_cgroup_ram_in_use
+ on: k8s.cgroup.mem_usage
+ class: Utilization
+ type: Cgroups
+component: Memory
+ os: linux
+ hosts: *
+ calc: ($ram) * 100 / $memory_limit
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: cgroup memory utilization
+ to: sysadmin
+
+# check for packet storms
+
+# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
+# 2. do the same for the last 10s
+# 3. raise an alarm if the latter is 10x or 20x the first
+# we assume the minimum packet storm should at least have
+# 10000 packets/s, average of the last 10 seconds
+
+ template: k8s_cgroup_1m_received_packets_rate
+ on: k8s.cgroup.net_packets
+ class: Workload
+ type: Cgroups
+component: Network
+ hosts: *
+ lookup: average -1m unaligned of received
+ units: packets
+ every: 10s
+ info: average number of packets received by the network interface $family over the last minute
+
+ template: k8s_cgroup_10s_received_packets_storm
+ on: k8s.cgroup.net_packets
+ class: Workload
+ type: Cgroups
+component: Network
+ hosts: *
+ lookup: average -10s unaligned of received
+ calc: $this * 100 / (($k8s_cgroup_1m_received_packets_rate < 1000)?(1000):($k8s_cgroup_1m_received_packets_rate))
+ every: 10s
+ units: %
+ warn: $this > (($status >= $WARNING)?(200):(5000))
+ crit: $this > (($status == $CRITICAL)?(5000):(6000))
+ options: no-clear-notification
+ info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \
+ compared to the rate over the last minute
+ to: sysadmin
diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf
index 8bf84a976..a84ab342f 100644
--- a/health/health.d/go.d.plugin.conf
+++ b/health/health.d/go.d.plugin.conf
@@ -6,7 +6,7 @@
class: Error
type: Netdata
component: go.d.plugin
- module: *
+ module: !* *
calc: $now - $last_collected_t
units: seconds ago
every: 10s
diff --git a/health/health.d/ml.conf b/health/health.d/ml.conf
new file mode 100644
index 000000000..9bcc81e76
--- /dev/null
+++ b/health/health.d/ml.conf
@@ -0,0 +1,36 @@
+# below are some examples of using the `anomaly-bit` option to define alerts based on anomaly
+# rates as opposed to raw metric values. You can read more about the anomaly-bit and Netdata's
+# native anomaly detection here:
+# https://learn.netdata.cloud/docs/configure/machine-learning#anomaly-bit---100--anomalous-0--normal
+
+# examples below are commented, you would need to uncomment and adjust as desired to enable them.
+
+# alert per dimension example
+# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error).
+# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error).
+# template: ml_5min_cpu_dims
+# on: system.cpu
+# os: linux
+# hosts: *
+# lookup: average -5m anomaly-bit foreach *
+# calc: $this
+# units: %
+# every: 30s
+# warn: $this > (($status >= $WARNING) ? (5) : (20))
+# crit: $this > (($status == $CRITICAL) ? (20) : (100))
+# info: rolling 5min anomaly rate for each system.cpu dimension
+
+# alert per chart example
+# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error).
+# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error).
+# template: ml_5min_cpu_chart
+# on: system.cpu
+# os: linux
+# hosts: *
+# lookup: average -5m anomaly-bit of *
+# calc: $this
+# units: %
+# every: 30s
+# warn: $this > (($status >= $WARNING) ? (5) : (20))
+# crit: $this > (($status == $CRITICAL) ? (20) : (100))
+# info: rolling 5min anomaly rate for system.cpu chart
\ No newline at end of file
diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf
index f3abc588f..e3b3d11cf 100644
--- a/health/health.d/python.d.plugin.conf
+++ b/health/health.d/python.d.plugin.conf
@@ -6,7 +6,7 @@
class: Error
type: Netdata
component: python.d.plugin
- module: *
+ module: !* *
calc: $now - $last_collected_t
units: seconds ago
every: 10s
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index ff5f3ac17..ab382c43b 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -54,7 +54,7 @@ host labels: _is_k8s_node = false
component: Memory
os: freebsd
hosts: *
- calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+ calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers + $cache + $free + $inactive)
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (80) : (90))
@@ -64,13 +64,13 @@ component: Memory
to: sysadmin
alarm: ram_available
- on: system.ram
+ on: mem.available
class: Utilization
type: System
component: Memory
os: freebsd
hosts: *
- calc: ($free + $inactive + $cache) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+ calc: $avail * 100 / ($system.ram.free + $system.ram.active + $system.ram.inactive + $system.ram.wired + $system.ram.cache + $system.ram.laundry + $system.ram.buffers)
units: %
every: 10s
warn: $this < (($status >= $WARNING) ? (15) : (10))
diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf
index dfb771e8c..cad5230c5 100644
--- a/health/health.d/redis.conf
+++ b/health/health.d/redis.conf
@@ -6,7 +6,7 @@
type: KV Storage
component: Redis
every: 10s
- crit: $rdb_last_bgsave_status != 0
+ crit: $last_bgsave != nan AND $last_bgsave != 0
units: ok/failed
info: status of the last RDB save operation (0: ok, 1: error)
delay: down 5m multiplier 1.5 max 1h
@@ -19,8 +19,9 @@ component: Redis
type: KV Storage
component: Redis
every: 10s
- warn: $rdb_bgsave_in_progress > 600
- crit: $rdb_bgsave_in_progress > 1200
+ calc: $current_bgsave_time
+ warn: $this > 600
+ crit: $this > 1200
units: seconds
info: duration of the on-going RDB save operation
delay: down 5m multiplier 1.5 max 1h
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
index 454e0abef..c33c4664c 100644
--- a/health/health.d/web_log.conf
+++ b/health/health.d/web_log.conf
@@ -1,218 +1,4 @@
-# -----------------------------------------------------------------------------
-# high level response code alarms
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-# $1m_requests > 120
-#
-# i.e. when there are at least 120 requests during the last minute
-
- template: 1m_requests
- on: web_log.response_statuses
- class: Workload
- type: Web Server
-component: Web log
- families: *
- lookup: sum -1m unaligned
- calc: ($this == 0)?(1):($this)
- units: requests
- every: 10s
- info: number of HTTP requests in the last minute
-
- template: 1m_successful
- on: web_log.response_statuses
- class: Workload
- type: Web Server
-component: Web log
- families: *
- lookup: sum -1m unaligned of successful_requests
- calc: $this * 100 / $1m_requests
- units: %
- every: 10s
- warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
- crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
- delay: up 2m down 15m multiplier 1.5 max 1h
- info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
- to: webmaster
-
- template: 1m_redirects
- on: web_log.response_statuses
- class: Workload
- type: Web Server
-component: Web log
- families: *
- lookup: sum -1m unaligned of redirects
- calc: $this * 100 / $1m_requests
- units: %
- every: 10s
- warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
- delay: up 2m down 15m multiplier 1.5 max 1h
- info: ratio of redirection HTTP requests over the last minute (3xx except 304)
- to: webmaster
-
- template: 1m_bad_requests
- on: web_log.response_statuses
- class: Errors
- type: Web Server
-component: Web log
- families: *
- lookup: sum -1m unaligned of bad_requests
- calc: $this * 100 / $1m_requests
- units: %
- every: 10s
- warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
- delay: up 2m down 15m multiplier 1.5 max 1h
- info: ratio of client error HTTP requests over the last minute (4xx except 401)
- to: webmaster
-
- template: 1m_internal_errors
- on: web_log.response_statuses
- class: Errors
- type: Web Server
-component: Web log
- families: *
- lookup: sum -1m unaligned of server_errors
- calc: $this * 100 / $1m_requests
- units: %
- every: 10s
- warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
- crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
- delay: up 2m down 15m multiplier 1.5 max 1h
- info: ratio of server error HTTP requests over the last minute (5xx)
- to: webmaster
-
-# unmatched lines
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-# $1m_total_requests > 120
-#
-# i.e. when there are at least 120 requests during the last minute
-
- template: 1m_total_requests
- on: web_log.response_codes
- class: Workload
- type: Web Server
-component: Web log
- families: *
- lookup: sum -1m unaligned
- calc: ($this == 0)?(1):($this)
- units: requests
- every: 10s
- info: number of HTTP requests over the last minute
-
- template: 1m_unmatched
- on: web_log.response_codes
- class: Errors
- type: Web Server
-component: Web log
- families: *
- lookup: sum -1m unaligned of unmatched
- calc: $this * 100 / $1m_total_requests
- units: %
- every: 10s
- warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 )
- delay: up 1m down 5m multiplier 1.5 max 1h
- info: percentage of unparsed log lines over the last minute
- to: webmaster
-
-# -----------------------------------------------------------------------------
-# web slow
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-# $1m_requests > 120
-#
-# i.e. when there are at least 120 requests during the last minute
-
- template: 10m_response_time
- on: web_log.response_time
- class: Latency
- type: System
-component: Web log
- families: *
- lookup: average -10m unaligned of avg
- units: ms
- every: 30s
- info: average HTTP response time over the last 10 minutes
-
- template: web_slow
- on: web_log.response_time
- class: Latency
- type: Web Server
-component: Web log
- families: *
- lookup: average -1m unaligned of avg
- units: ms
- every: 10s
- green: 500
- red: 1000
- warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
- crit: ($1m_requests > 120) ? ($this > $red && $this > ($10m_response_time * 4) ) : ( 0 )
- delay: down 15m multiplier 1.5 max 1h
- info: average HTTP response time over the last minute
- options: no-clear-notification
- to: webmaster
-
-# -----------------------------------------------------------------------------
-# web too many or too few requests
-
-# the following alarms trigger only when there are enough data.
-# we assume there are enough data when:
-#
-# $5m_successful_old > 120
-#
-# i.e. when there were at least 120 requests during the 5 minutes starting
-# at -10m and ending at -5m
-
- template: 5m_successful_old
- on: web_log.response_statuses
- class: Workload
- type: Web Server
-component: Web log
- families: *
- lookup: average -5m at -5m unaligned of successful_requests
- units: requests/s
- every: 30s
- info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
-
- template: 5m_successful
- on: web_log.response_statuses
- class: Workload
- type: Web Server
-component: Web log
- families: *
- lookup: average -5m unaligned of successful_requests
- units: requests/s
- every: 30s
- info: average number of successful HTTP requests over the last 5 minutes
-
- template: 5m_requests_ratio
- on: web_log.response_codes
- class: Workload
- type: Web Server
-component: Web log
- families: *
- calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100)
- units: %
- every: 30s
- warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
- crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
- delay: down 15m multiplier 1.5 max 1h
- options: no-clear-notification
- info: ratio of successful HTTP requests over the last 5 minutes, \
- compared with the previous 5 minutes \
- (clear notification for this alarm will not be sent)
- to: webmaster
-
-
-
-# ---------------------------------------------------GO-VERSION---------------------------------------------------------
-
# unmatched lines
# the following alarms trigger only when there are enough data.