summary | refs | log | tree | commit | diff | stats
path: root/health/health.d/cgroups.conf
diff options
context:
space:
mode:
Diffstat (limited to 'health/health.d/cgroups.conf')
-rw-r--r-- health/health.d/cgroups.conf 155
1 files changed, 79 insertions, 76 deletions
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index f625e5455..53a6ea00f 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -11,11 +11,10 @@ component: CPU
lookup: average -10m unaligned
units: %
every: 1m
- warn: $this > (($status >= $WARNING) ? (75) : (85))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ warn: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
info: average cgroup CPU utilization over the last 10 minutes
- to: sysadmin
+ to: silent
template: cgroup_ram_in_use
on: cgroup.mem_usage
@@ -31,44 +30,45 @@ component: Memory
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
info: cgroup memory utilization
- to: sysadmin
-
-# -----------------------------------------------------------------------------
-# check for packet storms
-
-# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
-# 2. do the same for the last 10s
-# 3. raise an alarm if the later is 10x or 20x the first
-# we assume the minimum packet storm should at least have
-# 10000 packets/s, average of the last 10 seconds
-
- template: cgroup_1m_received_packets_rate
- on: cgroup.net_packets
- class: Workload
- type: Cgroups
-component: Network
- hosts: *
- lookup: average -1m unaligned of received
- units: packets
- every: 10s
- info: average number of packets received by the network interface ${label:device} over the last minute
-
- template: cgroup_10s_received_packets_storm
- on: cgroup.net_packets
- class: Workload
- type: Cgroups
-component: Network
- hosts: *
- lookup: average -10s unaligned of received
- calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
- every: 10s
- units: %
- warn: $this > (($status >= $WARNING)?(200):(5000))
- options: no-clear-notification
- info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
- compared to the rate over the last minute
- to: sysadmin
+ to: silent
+# FIXME: the packet-storm alarms below are commented out due to a bug in Netdata
+## -----------------------------------------------------------------------------
+## check for packet storms
+#
+## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
+## 2. do the same for the last 10s
+## 3. raise an alarm if the latter is 10x or 20x the first
+## we assume the minimum packet storm should at least have
+## 10000 packets/s, average of the last 10 seconds
+#
+# template: cgroup_1m_received_packets_rate
+# on: cgroup.net_packets
+# class: Workload
+# type: Cgroups
+#component: Network
+# hosts: *
+# lookup: average -1m unaligned of received
+# units: packets
+# every: 10s
+# info: average number of packets received by the network interface ${label:device} over the last minute
+#
+# template: cgroup_10s_received_packets_storm
+# on: cgroup.net_packets
+# class: Workload
+# type: Cgroups
+#component: Network
+# hosts: *
+# lookup: average -10s unaligned of received
+# calc: $this * 100 / (($cgroup_1m_received_packets_rate < 1000)?(1000):($cgroup_1m_received_packets_rate))
+# every: 10s
+# units: %
+# warn: $this > (($status >= $WARNING)?(200):(5000))
+# options: no-clear-notification
+# info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
+# compared to the rate over the last minute
+# to: sysadmin
+#
# ---------------------------------K8s containers--------------------------------------------
template: k8s_cgroup_10min_cpu_usage
@@ -83,8 +83,9 @@ component: CPU
every: 1m
warn: $this > (($status >= $WARNING) ? (75) : (85))
delay: down 15m multiplier 1.5 max 1h
- info: average cgroup CPU utilization over the last 10 minutes
- to: sysadmin
+ info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
+ average CPU utilization over the last 10 minutes
+ to: silent
template: k8s_cgroup_ram_in_use
on: k8s.cgroup.mem_usage
@@ -99,40 +100,42 @@ component: Memory
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
- info: cgroup memory utilization
- to: sysadmin
+ info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
+ memory utilization
+ to: silent
# check for packet storms
-# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
-# 2. do the same for the last 10s
-# 3. raise an alarm if the later is 10x or 20x the first
-# we assume the minimum packet storm should at least have
-# 10000 packets/s, average of the last 10 seconds
-
- template: k8s_cgroup_1m_received_packets_rate
- on: k8s.cgroup.net_packets
- class: Workload
- type: Cgroups
-component: Network
- hosts: *
- lookup: average -1m unaligned of received
- units: packets
- every: 10s
- info: average number of packets received by the network interface ${label:device} over the last minute
-
- template: k8s_cgroup_10s_received_packets_storm
- on: k8s.cgroup.net_packets
- class: Workload
- type: Cgroups
-component: Network
- hosts: *
- lookup: average -10s unaligned of received
- calc: $this * 100 / (($k8s_cgroup_10s_received_packets_storm < 1000)?(1000):($k8s_cgroup_10s_received_packets_storm))
- every: 10s
- units: %
- warn: $this > (($status >= $WARNING)?(200):(5000))
- options: no-clear-notification
- info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
- compared to the rate over the last minute
- to: sysadmin
+# FIXME: the packet-storm alarms below are commented out due to a bug in Netdata
+## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
+## 2. do the same for the last 10s
+## 3. raise an alarm if the latter is 10x or 20x the first
+## we assume the minimum packet storm should at least have
+## 10000 packets/s, average of the last 10 seconds
+#
+# template: k8s_cgroup_1m_received_packets_rate
+# on: k8s.cgroup.net_packets
+# class: Workload
+# type: Cgroups
+#component: Network
+# hosts: *
+# lookup: average -1m unaligned of received
+# units: packets
+# every: 10s
+# info: average number of packets received by the network interface ${label:device} over the last minute
+#
+# template: k8s_cgroup_10s_received_packets_storm
+# on: k8s.cgroup.net_packets
+# class: Workload
+# type: Cgroups
+#component: Network
+# hosts: *
+# lookup: average -10s unaligned of received
+# calc: $this * 100 / (($k8s_cgroup_1m_received_packets_rate < 1000)?(1000):($k8s_cgroup_1m_received_packets_rate))
+# every: 10s
+# units: %
+# warn: $this > (($status >= $WARNING)?(200):(5000))
+# options: no-clear-notification
+# info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
+# compared to the rate over the last minute
+# to: sysadmin