diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2023-08-10 09:18:49 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2023-08-10 09:18:49 +0000 |
commit | dd814a7c1a8de056a79f7238578b09236edd5506 (patch) | |
tree | 429e7eed5a634a4efe9a6877ce66da8e64aa1782 /health/health.d/cgroups.conf | |
parent | Adding upstream version 1.41.0. (diff) | |
download | netdata-dd814a7c1a8de056a79f7238578b09236edd5506.tar.xz netdata-dd814a7c1a8de056a79f7238578b09236edd5506.zip |
Adding upstream version 1.42.0. (tag: upstream/1.42.0)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health/health.d/cgroups.conf')
-rw-r--r-- | health/health.d/cgroups.conf | 155 |
1 files changed, 79 insertions, 76 deletions
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf index f625e545..53a6ea00 100644 --- a/health/health.d/cgroups.conf +++ b/health/health.d/cgroups.conf @@ -11,11 +11,10 @@ component: CPU lookup: average -10m unaligned units: % every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) + warn: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h info: average cgroup CPU utilization over the last 10 minutes - to: sysadmin + to: silent template: cgroup_ram_in_use on: cgroup.mem_usage @@ -31,44 +30,45 @@ component: Memory crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h info: cgroup memory utilization - to: sysadmin - -# ----------------------------------------------------------------------------- -# check for packet storms - -# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate -# 2. do the same for the last 10s -# 3. 
raise an alarm if the later is 10x or 20x the first -# we assume the minimum packet storm should at least have -# 10000 packets/s, average of the last 10 seconds - - template: cgroup_1m_received_packets_rate - on: cgroup.net_packets - class: Workload - type: Cgroups -component: Network - hosts: * - lookup: average -1m unaligned of received - units: packets - every: 10s - info: average number of packets received by the network interface ${label:device} over the last minute - - template: cgroup_10s_received_packets_storm - on: cgroup.net_packets - class: Workload - type: Cgroups -component: Network - hosts: * - lookup: average -10s unaligned of received - calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) - every: 10s - units: % - warn: $this > (($status >= $WARNING)?(200):(5000)) - options: no-clear-notification - info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ - compared to the rate over the last minute - to: sysadmin + to: silent +# FIXME COMMENTED DUE TO A BUG IN NETDATA +## ----------------------------------------------------------------------------- +## check for packet storms +# +## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate +## 2. do the same for the last 10s +## 3. 
raise an alarm if the later is 10x or 20x the first +## we assume the minimum packet storm should at least have +## 10000 packets/s, average of the last 10 seconds +# +# template: cgroup_1m_received_packets_rate +# on: cgroup.net_packets +# class: Workload +# type: Cgroups +#component: Network +# hosts: * +# lookup: average -1m unaligned of received +# units: packets +# every: 10s +# info: average number of packets received by the network interface ${label:device} over the last minute +# +# template: cgroup_10s_received_packets_storm +# on: cgroup.net_packets +# class: Workload +# type: Cgroups +#component: Network +# hosts: * +# lookup: average -10s unaligned of received +# calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) +# every: 10s +# units: % +# warn: $this > (($status >= $WARNING)?(200):(5000)) +# options: no-clear-notification +# info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ +# compared to the rate over the last minute +# to: sysadmin +# # ---------------------------------K8s containers-------------------------------------------- template: k8s_cgroup_10min_cpu_usage @@ -83,8 +83,9 @@ component: CPU every: 1m warn: $this > (($status >= $WARNING) ? (75) : (85)) delay: down 15m multiplier 1.5 max 1h - info: average cgroup CPU utilization over the last 10 minutes - to: sysadmin + info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ + average CPU utilization over the last 10 minutes + to: silent template: k8s_cgroup_ram_in_use on: k8s.cgroup.mem_usage @@ -99,40 +100,42 @@ component: Memory warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: cgroup memory utilization - to: sysadmin + info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ + memory utilization + to: silent # check for packet storms -# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate -# 2. do the same for the last 10s -# 3. raise an alarm if the later is 10x or 20x the first -# we assume the minimum packet storm should at least have -# 10000 packets/s, average of the last 10 seconds - - template: k8s_cgroup_1m_received_packets_rate - on: k8s.cgroup.net_packets - class: Workload - type: Cgroups -component: Network - hosts: * - lookup: average -1m unaligned of received - units: packets - every: 10s - info: average number of packets received by the network interface ${label:device} over the last minute - - template: k8s_cgroup_10s_received_packets_storm - on: k8s.cgroup.net_packets - class: Workload - type: Cgroups -component: Network - hosts: * - lookup: average -10s unaligned of received - calc: $this * 100 / (($k8s_cgroup_10s_received_packets_storm < 1000)?(1000):($k8s_cgroup_10s_received_packets_storm)) - every: 10s - units: % - warn: $this > (($status >= $WARNING)?(200):(5000)) - options: no-clear-notification - info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ - compared to the rate over the last minute - to: sysadmin +# FIXME COMMENTED DUE TO A BUG IN NETDATA +## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate +## 2. do the same for the last 10s +## 3. 
raise an alarm if the later is 10x or 20x the first +## we assume the minimum packet storm should at least have +## 10000 packets/s, average of the last 10 seconds +# +# template: k8s_cgroup_1m_received_packets_rate +# on: k8s.cgroup.net_packets +# class: Workload +# type: Cgroups +#component: Network +# hosts: * +# lookup: average -1m unaligned of received +# units: packets +# every: 10s +# info: average number of packets received by the network interface ${label:device} over the last minute +# +# template: k8s_cgroup_10s_received_packets_storm +# on: k8s.cgroup.net_packets +# class: Workload +# type: Cgroups +#component: Network +# hosts: * +# lookup: average -10s unaligned of received +# calc: $this * 100 / (($k8s_cgroup_10s_received_packets_storm < 1000)?(1000):($k8s_cgroup_10s_received_packets_storm)) +# every: 10s +# units: % +# warn: $this > (($status >= $WARNING)?(200):(5000)) +# options: no-clear-notification +# info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ +# compared to the rate over the last minute +# to: sysadmin |