# you can disable an alarm notification by setting the 'to' line to: silent

# -----------------------------------------------------------------------------
# cgroup CPU utilization
#
# Warns when the 10-minute average of cgroup.cpu_limit rises above 95%.
# Once the alarm is in WARNING it stays raised until the value falls to 85%
# or below (hysteresis), so it does not flap around the 95% threshold.
# NOTE: this template defines no 'crit' line, so the hysteresis condition has
# to test for WARNING; testing for CRITICAL would never match.

 template: cgroup_10min_cpu_usage
       on: cgroup.cpu_limit
    class: Utilization
     type: Cgroups
component: CPU
       os: linux
    hosts: *
   lookup: average -10m unaligned
    units: %
    every: 1m
     warn: $this > (($status >= $WARNING) ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
     info: average cgroup CPU utilization over the last 10 minutes
       to: silent

# -----------------------------------------------------------------------------
# cgroup memory utilization
#
# Computes $ram as a percentage of $memory_limit every 10 seconds.
# WARNING is raised above 90% and held until the value drops to 80% or below;
# CRITICAL is raised above 98% and held until the value drops to 90% or below.

 template: cgroup_ram_in_use
       on: cgroup.mem_usage
    class: Utilization
     type: Cgroups
component: Memory
       os: linux
    hosts: *
     calc: ($ram) * 100 / $memory_limit
    units: %
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
     info: cgroup memory utilization
       to: silent

# FIXME COMMENTED DUE TO A BUG IN NETDATA
## -----------------------------------------------------------------------------
## check for packet storms
#
## 1. calculate the rate packets are received in 1m: cgroup_1m_received_packets_rate
## 2. do the same for the last 10s
## 3. raise an alarm if the latter is 10x or 20x the first
##    we assume the minimum packet storm should at least have
##    10000 packets/s, average of the last 10 seconds
##
## NOTE(review): the calc below now refers to the 1-minute template by the name
## it is declared with in this file ($cgroup_1m_received_packets_rate); verify
## variable resolution before re-enabling.
#
# template: cgroup_1m_received_packets_rate
#       on: cgroup.net_packets
#    class: Workload
#     type: Cgroups
#component: Network
#    hosts: *
#   lookup: average -1m unaligned of received
#    units: packets
#    every: 10s
#     info: average number of packets received by the network interface ${label:device} over the last minute
#
# template: cgroup_10s_received_packets_storm
#       on: cgroup.net_packets
#    class: Workload
#     type: Cgroups
#component: Network
#    hosts: *
#   lookup: average -10s unaligned of received
#     calc: $this * 100 / (($cgroup_1m_received_packets_rate < 1000)?(1000):($cgroup_1m_received_packets_rate))
#    every: 10s
#    units: %
#     warn: $this > (($status >= $WARNING)?(200):(5000))
#  options: no-clear-notification
#     info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
#           compared to the rate over the last minute
#       to: sysadmin
#

# ---------------------------------K8s containers--------------------------------------------

# Kubernetes container CPU utilization
#
# Warns when the 10-minute average of k8s.cgroup.cpu_limit rises above 85%,
# and keeps the warning raised until the value falls to 75% or below.

 template: k8s_cgroup_10min_cpu_usage
       on: k8s.cgroup.cpu_limit
    class: Utilization
     type: Cgroups
component: CPU
       os: linux
    hosts: *
   lookup: average -10m unaligned
    units: %
    every: 1m
     warn: $this > (($status >= $WARNING) ? (75) : (85))
    delay: down 15m multiplier 1.5 max 1h
     info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
           average CPU utilization over the last 10 minutes
       to: silent

# Kubernetes container memory utilization
#
# Computes $ram as a percentage of $memory_limit every 10 seconds.
# WARNING is raised above 90% and held until the value drops to 80% or below;
# CRITICAL is raised above 98% and held until the value drops to 90% or below.

 template: k8s_cgroup_ram_in_use
       on: k8s.cgroup.mem_usage
    class: Utilization
     type: Cgroups
component: Memory
       os: linux
    hosts: *
     calc: ($ram) * 100 / $memory_limit
    units: %
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
     info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
           memory utilization
       to: silent

# check for packet storms

# FIXME COMMENTED DUE TO A BUG IN NETDATA
## 1. calculate the rate packets are received in 1m: k8s_cgroup_1m_received_packets_rate
## 2. do the same for the last 10s
## 3. raise an alarm if the latter is 10x or 20x the first
##    we assume the minimum packet storm should at least have
##    10000 packets/s, average of the last 10 seconds
##
## NOTE(review): the calc below previously divided by the storm template's own
## value ($k8s_cgroup_10s_received_packets_storm); it now divides by the
## 1-minute rate declared just above it, matching the description in steps 1-3.
#
# template: k8s_cgroup_1m_received_packets_rate
#       on: k8s.cgroup.net_packets
#    class: Workload
#     type: Cgroups
#component: Network
#    hosts: *
#   lookup: average -1m unaligned of received
#    units: packets
#    every: 10s
#     info: average number of packets received by the network interface ${label:device} over the last minute
#
# template: k8s_cgroup_10s_received_packets_storm
#       on: k8s.cgroup.net_packets
#    class: Workload
#     type: Cgroups
#component: Network
#    hosts: *
#   lookup: average -10s unaligned of received
#     calc: $this * 100 / (($k8s_cgroup_1m_received_packets_rate < 1000)?(1000):($k8s_cgroup_1m_received_packets_rate))
#    every: 10s
#    units: %
#     warn: $this > (($status >= $WARNING)?(200):(5000))
#  options: no-clear-notification
#     info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
#           compared to the rate over the last minute
#       to: sysadmin