summaryrefslogtreecommitdiffstats
path: root/health/health.d/cgroups.conf
blob: 53a6ea00ff714cbea1b8d463425fe98f7602fdeb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# you can disable an alarm notification by setting the 'to' line to: silent

 template: cgroup_10min_cpu_usage
# Warn on sustained high cgroup CPU utilization, using the 10-minute average of cgroup.cpu_limit.
       on: cgroup.cpu_limit
    class: Utilization
     type: Cgroups
component: CPU
       os: linux
    hosts: *
   lookup: average -10m unaligned
    units: %
    every: 1m
# hysteresis: WARNING is raised above 95% and kept until the average falls to 85% or below.
# The state check is on $WARNING because this template has no crit line, so $status never becomes $CRITICAL.
     warn: $this > (($status >= $WARNING)  ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
     info: average cgroup CPU utilization over the last 10 minutes
# notifications for this alarm are disabled
       to: silent

 template: cgroup_ram_in_use
# Alert on cgroup memory utilization, expressed as a percentage of the cgroup memory limit.
       on: cgroup.mem_usage
    class: Utilization
     type: Cgroups
component: Memory
       os: linux
    hosts: *
# $ram as a percentage of $memory_limit
# NOTE(review): presumably $ram is a dimension of cgroup.mem_usage and $memory_limit a chart variable - confirm
     calc: ($ram) * 100 / $memory_limit
    units: %
    every: 10s
# hysteresis: WARNING is raised above 90% and kept until the value falls to 80% or below
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
# hysteresis: CRITICAL is raised above 98% and kept until the value falls to 90% or below
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
     info: cgroup memory utilization
# notifications for this alarm are disabled
       to: silent

# FIXME COMMENTED DUE TO A BUG IN NETDATA
## -----------------------------------------------------------------------------
## check for packet storms
#
## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
## 2. do the same for the last 10s
## 3. raise an alarm if the latter is 10x or 20x the first
## we assume the minimum packet storm should at least have
## 10000 packets/s, average of the last 10 seconds
#
# template: cgroup_1m_received_packets_rate
#       on: cgroup.net_packets
#    class: Workload
#     type: Cgroups
#component: Network
#    hosts: *
#   lookup: average -1m unaligned of received
#    units: packets
#    every: 10s
#     info: average number of packets received by the network interface ${label:device} over the last minute
#
# template: cgroup_10s_received_packets_storm
#       on: cgroup.net_packets
#    class: Workload
#     type: Cgroups
#component: Network
#    hosts: *
#   lookup: average -10s unaligned of received
#     calc: $this * 100 / (($cgroup_1m_received_packets_rate < 1000)?(1000):($cgroup_1m_received_packets_rate))
#    every: 10s
#    units: %
#     warn: $this > (($status >= $WARNING)?(200):(5000))
#  options: no-clear-notification
#     info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
#           compared to the rate over the last minute
#       to: sysadmin
#
# ---------------------------------K8s containers--------------------------------------------

 template: k8s_cgroup_10min_cpu_usage
# Warn on sustained high CPU utilization of a Kubernetes container, using the 10-minute average of k8s.cgroup.cpu_limit.
       on: k8s.cgroup.cpu_limit
    class: Utilization
     type: Cgroups
component: CPU
       os: linux
    hosts: *
   lookup: average -10m unaligned
    units: %
    every: 1m
# hysteresis: WARNING is raised above 85% and kept until the average falls to 75% or below
     warn: $this > (($status >= $WARNING)  ? (75) : (85))
    delay: down 15m multiplier 1.5 max 1h
# the info text is one logical line; the trailing backslash continues it onto the next line
     info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
           average CPU utilization over the last 10 minutes
# notifications for this alarm are disabled
       to: silent

 template: k8s_cgroup_ram_in_use
# Alert on memory utilization of a Kubernetes container, expressed as a percentage of its memory limit.
       on: k8s.cgroup.mem_usage
    class: Utilization
     type: Cgroups
component: Memory
       os: linux
    hosts: *
# $ram as a percentage of $memory_limit
# NOTE(review): presumably $ram is a dimension of k8s.cgroup.mem_usage and $memory_limit a chart variable - confirm
     calc: ($ram) * 100 / $memory_limit
    units: %
    every: 10s
# hysteresis: WARNING is raised above 90% and kept until the value falls to 80% or below
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
# hysteresis: CRITICAL is raised above 98% and kept until the value falls to 90% or below
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
# the info text is one logical line; the trailing backslash continues it onto the next line
     info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
           memory utilization
# notifications for this alarm are disabled
       to: silent

# check for packet storms

# FIXME COMMENTED DUE TO A BUG IN NETDATA
## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
## 2. do the same for the last 10s
## 3. raise an alarm if the latter is 10x or 20x the first
## we assume the minimum packet storm should at least have
## 10000 packets/s, average of the last 10 seconds
#
# template: k8s_cgroup_1m_received_packets_rate
#       on: k8s.cgroup.net_packets
#    class: Workload
#     type: Cgroups
#component: Network
#    hosts: *
#   lookup: average -1m unaligned of received
#    units: packets
#    every: 10s
#     info: average number of packets received by the network interface ${label:device} over the last minute
#
# template: k8s_cgroup_10s_received_packets_storm
#       on: k8s.cgroup.net_packets
#    class: Workload
#     type: Cgroups
#component: Network
#    hosts: *
#   lookup: average -10s unaligned of received
#     calc: $this * 100 / (($k8s_cgroup_1m_received_packets_rate < 1000)?(1000):($k8s_cgroup_1m_received_packets_rate))
#    every: 10s
#    units: %
#     warn: $this > (($status >= $WARNING)?(200):(5000))
#  options: no-clear-notification
#     info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
#           compared to the rate over the last minute
#       to: sysadmin