summaryrefslogtreecommitdiffstats
path: root/health/health.d/cgroups.conf
blob: 08260ff6d9b5ca302422a887791bb3aa1d1ab4c6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# you can disable an alarm notification by setting the 'to' line to: silent

 # Alarm on the 10-minute average of the cgroup.cpu_limit chart (units: %).
 # Evaluated every minute, on linux hosts only.
 # - WARNING is raised above 85% and kept until the value is no longer above 75%.
 # - CRITICAL is raised above 95% and kept until the value is no longer above 85%.
 # The 'delay' line postpones notifications when the status goes down
 # (15m base, multiplier 1.5, capped at 1h).
 template: cgroup_10min_cpu_usage
       on: cgroup.cpu_limit
    class: Utilization
     type: Cgroups
component: CPU
       os: linux
    hosts: *
   lookup: average -10m unaligned
    units: %
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (75) : (85))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
     info: average cgroup CPU utilization over the last 10 minutes
       to: sysadmin

 # Alarm on the cgroup.mem_usage chart: $ram expressed as a percentage of
 # $memory_limit. Evaluated every 10 seconds, on linux hosts only.
 # - WARNING is raised above 90% and kept until the value is no longer above 80%.
 # - CRITICAL is raised above 98% and kept until the value is no longer above 90%.
 # The 'delay' line postpones notifications when the status goes down
 # (15m base, multiplier 1.5, capped at 1h).
 template: cgroup_ram_in_use
       on: cgroup.mem_usage
    class: Utilization
     type: Cgroups
component: Memory
       os: linux
    hosts: *
     calc: ($ram) * 100 / $memory_limit
    units: %
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
     info: cgroup memory utilization
       to: sysadmin

# -----------------------------------------------------------------------------
# check for packet storms

# 1. calculate the rate at which packets are received over the last 1m: cgroup_1m_received_packets_rate
# 2. do the same for the last 10s
# 3. raise an alarm if the latter is 50x (warning) or 60x (critical) the former
# the 1m rate is floored at 1000 packets/s in the calculation, so a packet storm
# needs an average of more than 50000 packets/s over the last 10 seconds

 # Baseline value only: this template has no warn/crit/to lines, so it never
 # raises an alarm by itself. Every 10 seconds it computes the 1-minute
 # average of the 'received' dimension of the cgroup.net_packets chart.
 template: cgroup_1m_received_packets_rate
       on: cgroup.net_packets
    class: Workload
     type: Cgroups
component: Network
    hosts: *
   lookup: average -1m unaligned of received
    units: packets
    every: 10s
     info: average number of packets received by the network interface ${label:device} over the last minute

 # Packet-storm detector for the cgroup.net_packets chart.
 # Every 10 seconds it takes the 10-second average of the 'received' dimension
 # and expresses it as a percentage of the 1-minute baseline computed by the
 # cgroup_1m_received_packets_rate template on the same chart. The baseline is
 # floored at 1000 so that idle interfaces do not produce huge ratios.
 # - WARNING is raised above 5000% and kept until the value is no longer above 200%.
 # - CRITICAL is raised above 6000% and kept until the value is no longer above 5000%.
 # The baseline variable must carry the 'cgroup_' prefix: that is the name of
 # the template defined for this chart.
 template: cgroup_10s_received_packets_storm
       on: cgroup.net_packets
    class: Workload
     type: Cgroups
component: Network
    hosts: *
   lookup: average -10s unaligned of received
     calc: $this * 100 / (($cgroup_1m_received_packets_rate < 1000)?(1000):($cgroup_1m_received_packets_rate))
    every: 10s
    units: %
     warn: $this > (($status >= $WARNING)?(200):(5000))
     crit: $this > (($status == $CRITICAL)?(5000):(6000))
  options: no-clear-notification
     info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
           compared to the rate over the last minute
       to: sysadmin

# ---------------------------------K8s containers--------------------------------------------

 # Alarm on the 10-minute average of the k8s.cgroup.cpu_limit chart (units: %).
 # Evaluated every minute, on linux hosts only.
 # - WARNING is raised above 85% and kept until the value is no longer above 75%.
 # - CRITICAL is raised above 95% and kept until the value is no longer above 85%.
 # The 'delay' line postpones notifications when the status goes down
 # (15m base, multiplier 1.5, capped at 1h).
 template: k8s_cgroup_10min_cpu_usage
       on: k8s.cgroup.cpu_limit
    class: Utilization
     type: Cgroups
component: CPU
       os: linux
    hosts: *
   lookup: average -10m unaligned
    units: %
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (75) : (85))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
     info: average cgroup CPU utilization over the last 10 minutes
       to: sysadmin

 # Alarm on the k8s.cgroup.mem_usage chart: $ram expressed as a percentage of
 # $memory_limit. Evaluated every 10 seconds, on linux hosts only.
 # - WARNING is raised above 90% and kept until the value is no longer above 80%.
 # - CRITICAL is raised above 98% and kept until the value is no longer above 90%.
 # The 'delay' line postpones notifications when the status goes down
 # (15m base, multiplier 1.5, capped at 1h).
 template: k8s_cgroup_ram_in_use
       on: k8s.cgroup.mem_usage
    class: Utilization
     type: Cgroups
component: Memory
       os: linux
    hosts: *
     calc: ($ram) * 100 / $memory_limit
    units: %
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
     info: cgroup memory utilization
       to: sysadmin

# check for packet storms

# 1. calculate the rate at which packets are received over the last 1m: k8s_cgroup_1m_received_packets_rate
# 2. do the same for the last 10s
# 3. raise an alarm if the latter is 50x (warning) or 60x (critical) the former
# the 1m rate is meant to be floored at 1000 packets/s in the calculation, so a
# packet storm needs an average of more than 50000 packets/s over the last 10 seconds

 # Baseline value only: this template has no warn/crit/to lines, so it never
 # raises an alarm by itself. Every 10 seconds it computes the 1-minute
 # average of the 'received' dimension of the k8s.cgroup.net_packets chart.
 template: k8s_cgroup_1m_received_packets_rate
       on: k8s.cgroup.net_packets
    class: Workload
     type: Cgroups
component: Network
    hosts: *
   lookup: average -1m unaligned of received
    units: packets
    every: 10s
     info: average number of packets received by the network interface ${label:device} over the last minute

 # Packet-storm detector for the k8s.cgroup.net_packets chart.
 # Every 10 seconds it takes the 10-second average of the 'received' dimension
 # and expresses it as a percentage of the 1-minute baseline computed by the
 # k8s_cgroup_1m_received_packets_rate template on the same chart. The baseline
 # is floored at 1000 so that idle interfaces do not produce huge ratios.
 # - WARNING is raised above 5000% and kept until the value is no longer above 200%.
 # - CRITICAL is raised above 6000% and kept until the value is no longer above 5000%.
 # The divisor must be the 1-minute baseline, not this alarm's own value:
 # the ratio is "last 10 seconds" versus "last minute".
 template: k8s_cgroup_10s_received_packets_storm
       on: k8s.cgroup.net_packets
    class: Workload
     type: Cgroups
component: Network
    hosts: *
   lookup: average -10s unaligned of received
     calc: $this * 100 / (($k8s_cgroup_1m_received_packets_rate < 1000)?(1000):($k8s_cgroup_1m_received_packets_rate))
    every: 10s
    units: %
     warn: $this > (($status >= $WARNING)?(200):(5000))
     crit: $this > (($status == $CRITICAL)?(5000):(6000))
  options: no-clear-notification
     info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
           compared to the rate over the last minute
       to: sysadmin