diff options
Diffstat (limited to 'health/health.d/cgroups.conf')
-rw-r--r-- | health/health.d/cgroups.conf | 155 |
1 file changed, 79 insertions, 76 deletions
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf index f625e5455..53a6ea00f 100644 --- a/health/health.d/cgroups.conf +++ b/health/health.d/cgroups.conf @@ -11,11 +11,10 @@ component: CPU lookup: average -10m unaligned units: % every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) + warn: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h info: average cgroup CPU utilization over the last 10 minutes - to: sysadmin + to: silent template: cgroup_ram_in_use on: cgroup.mem_usage @@ -31,44 +30,45 @@ component: Memory crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h info: cgroup memory utilization - to: sysadmin - -# ----------------------------------------------------------------------------- -# check for packet storms - -# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate -# 2. do the same for the last 10s -# 3. 
raise an alarm if the later is 10x or 20x the first -# we assume the minimum packet storm should at least have -# 10000 packets/s, average of the last 10 seconds - - template: cgroup_1m_received_packets_rate - on: cgroup.net_packets - class: Workload - type: Cgroups -component: Network - hosts: * - lookup: average -1m unaligned of received - units: packets - every: 10s - info: average number of packets received by the network interface ${label:device} over the last minute - - template: cgroup_10s_received_packets_storm - on: cgroup.net_packets - class: Workload - type: Cgroups -component: Network - hosts: * - lookup: average -10s unaligned of received - calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) - every: 10s - units: % - warn: $this > (($status >= $WARNING)?(200):(5000)) - options: no-clear-notification - info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ - compared to the rate over the last minute - to: sysadmin + to: silent +# FIXME COMMENTED DUE TO A BUG IN NETDATA +## ----------------------------------------------------------------------------- +## check for packet storms +# +## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate +## 2. do the same for the last 10s +## 3. 
raise an alarm if the latter is 10x or 20x the first +## we assume the minimum packet storm should at least have +## 10000 packets/s, average of the last 10 seconds +# +# template: cgroup_1m_received_packets_rate +# on: cgroup.net_packets +# class: Workload +# type: Cgroups +#component: Network +# hosts: * +# lookup: average -1m unaligned of received +# units: packets +# every: 10s +# info: average number of packets received by the network interface ${label:device} over the last minute +# +# template: cgroup_10s_received_packets_storm +# on: cgroup.net_packets +# class: Workload +# type: Cgroups +#component: Network +# hosts: * +# lookup: average -10s unaligned of received +# calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) +# every: 10s +# units: % +# warn: $this > (($status >= $WARNING)?(200):(5000)) +# options: no-clear-notification +# info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ +# compared to the rate over the last minute +# to: sysadmin +# # ---------------------------------K8s containers-------------------------------------------- template: k8s_cgroup_10min_cpu_usage @@ -83,8 +83,9 @@ component: CPU every: 1m warn: $this > (($status >= $WARNING) ? (75) : (85)) delay: down 15m multiplier 1.5 max 1h - info: average cgroup CPU utilization over the last 10 minutes - to: sysadmin + info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ + average CPU utilization over the last 10 minutes + to: silent template: k8s_cgroup_ram_in_use on: k8s.cgroup.mem_usage @@ -99,40 +100,42 @@ component: Memory warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: cgroup memory utilization - to: sysadmin + info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ + memory utilization + to: silent # check for packet storms -# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate -# 2. do the same for the last 10s -# 3. raise an alarm if the later is 10x or 20x the first -# we assume the minimum packet storm should at least have -# 10000 packets/s, average of the last 10 seconds - - template: k8s_cgroup_1m_received_packets_rate - on: k8s.cgroup.net_packets - class: Workload - type: Cgroups -component: Network - hosts: * - lookup: average -1m unaligned of received - units: packets - every: 10s - info: average number of packets received by the network interface ${label:device} over the last minute - - template: k8s_cgroup_10s_received_packets_storm - on: k8s.cgroup.net_packets - class: Workload - type: Cgroups -component: Network - hosts: * - lookup: average -10s unaligned of received - calc: $this * 100 / (($k8s_cgroup_10s_received_packets_storm < 1000)?(1000):($k8s_cgroup_10s_received_packets_storm)) - every: 10s - units: % - warn: $this > (($status >= $WARNING)?(200):(5000)) - options: no-clear-notification - info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ - compared to the rate over the last minute - to: sysadmin +# FIXME COMMENTED DUE TO A BUG IN NETDATA +## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate +## 2. do the same for the last 10s +## 3. 
raise an alarm if the latter is 10x or 20x the first +## we assume the minimum packet storm should at least have +## 10000 packets/s, average of the last 10 seconds +# +# template: k8s_cgroup_1m_received_packets_rate +# on: k8s.cgroup.net_packets +# class: Workload +# type: Cgroups +#component: Network +# hosts: * +# lookup: average -1m unaligned of received +# units: packets +# every: 10s +# info: average number of packets received by the network interface ${label:device} over the last minute +# +# template: k8s_cgroup_10s_received_packets_storm +# on: k8s.cgroup.net_packets +# class: Workload +# type: Cgroups +#component: Network +# hosts: * +# lookup: average -10s unaligned of received +# calc: $this * 100 / (($k8s_cgroup_10s_received_packets_storm < 1000)?(1000):($k8s_cgroup_10s_received_packets_storm)) +# every: 10s +# units: % +# warn: $this > (($status >= $WARNING)?(200):(5000)) +# options: no-clear-notification +# info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ +# compared to the rate over the last minute +# to: sysadmin |