blob: f625e5455d7ad2c63a164a62fabbe4a98d2b6fd1 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
|
# you can disable an alarm notification by setting the 'to' line to: silent
# Sustained CPU utilization of a cgroup.
# Evaluated every minute on the 10-minute average of the cgroup.cpu_limit
# chart (units: %).
# Thresholds use hysteresis through $status:
#   WARNING  is raised above 85% and kept while the value stays above 75%
#   CRITICAL is raised above 95% and kept while the value stays above 85%
# delay: notifications for a status decrease are held back 15m; the delay is
# multiplied by 1.5 each time the status changes while a notification is
# pending, up to a maximum of 1h.
template: cgroup_10min_cpu_usage
on: cgroup.cpu_limit
class: Utilization
type: Cgroups
component: CPU
os: linux
hosts: *
lookup: average -10m unaligned
units: %
every: 1m
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
info: average cgroup CPU utilization over the last 10 minutes
to: sysadmin
# Memory utilization of a cgroup, as a percentage of its memory limit.
# Evaluated every 10 seconds on the cgroup.mem_usage chart; there is no
# lookup, the value is computed directly as $ram * 100 / $memory_limit.
# NOTE(review): $ram is presumably the `ram` dimension of the chart and
# $memory_limit a chart variable set by the cgroups collector - confirm.
# Thresholds use hysteresis through $status:
#   WARNING  is raised above 90% and kept while the value stays above 80%
#   CRITICAL is raised above 98% and kept while the value stays above 90%
# delay: notifications for a status decrease are held back 15m; the delay is
# multiplied by 1.5 each time the status changes while a notification is
# pending, up to a maximum of 1h.
template: cgroup_ram_in_use
on: cgroup.mem_usage
class: Utilization
type: Cgroups
component: Memory
os: linux
hosts: *
calc: ($ram) * 100 / $memory_limit
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
info: cgroup memory utilization
to: sysadmin
# -----------------------------------------------------------------------------
# check for packet storms
# 1. calculate the rate at which packets are received over the last 1m: cgroup_1m_received_packets_rate
# 2. do the same for the last 10s
# 3. raise an alarm if the latter is more than 50x the former
#    (once raised, the alarm stays raised while it is more than 2x the former)
# the 1m rate is floored at 1000 packets/s, so the minimum packet storm
# must average more than 50000 packets/s over the last 10 seconds
# 1-minute average of the `received` dimension of cgroup.net_packets,
# recomputed every 10 seconds.
# This template defines no warn/crit expressions and no recipient, so it
# raises no alarm by itself; it only computes a value (step 1 of the packet
# storm check described in the notes of this file).
template: cgroup_1m_received_packets_rate
on: cgroup.net_packets
class: Workload
type: Cgroups
component: Network
hosts: *
lookup: average -1m unaligned of received
units: packets
every: 10s
info: average number of packets received by the network interface ${label:device} over the last minute
# Packet storm detector for cgroup network interfaces.
# $this starts as the 10-second average of the `received` dimension and is
# then converted by calc into a percentage of the 1-minute baseline computed
# by the sibling template cgroup_1m_received_packets_rate on the same chart.
# The baseline is floored at 1000 so that a nearly idle interface does not
# produce a huge ratio from a small burst.
# WARNING is raised above 5000% and kept while the ratio stays above 200%.
# options no-clear-notification: no notification is sent when the alarm clears.
template: cgroup_10s_received_packets_storm
on: cgroup.net_packets
class: Workload
type: Cgroups
component: Network
hosts: *
lookup: average -10s unaligned of received
calc: $this * 100 / (($cgroup_1m_received_packets_rate < 1000)?(1000):($cgroup_1m_received_packets_rate))
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(200):(5000))
options: no-clear-notification
info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
compared to the rate over the last minute
to: sysadmin
# ---------------------------------K8s containers--------------------------------------------
# Sustained CPU utilization of a Kubernetes container cgroup.
# Evaluated every minute on the 10-minute average of the k8s.cgroup.cpu_limit
# chart (units: %).
# Only a warning level is defined for this template (there is no crit line):
#   WARNING is raised above 85% and kept while the value stays above 75%
# delay: notifications for a status decrease are held back 15m; the delay is
# multiplied by 1.5 each time the status changes while a notification is
# pending, up to a maximum of 1h.
template: k8s_cgroup_10min_cpu_usage
on: k8s.cgroup.cpu_limit
class: Utilization
type: Cgroups
component: CPU
os: linux
hosts: *
lookup: average -10m unaligned
units: %
every: 1m
warn: $this > (($status >= $WARNING) ? (75) : (85))
delay: down 15m multiplier 1.5 max 1h
info: average cgroup CPU utilization over the last 10 minutes
to: sysadmin
# Memory utilization of a Kubernetes container cgroup, as a percentage of its
# memory limit.
# Evaluated every 10 seconds on the k8s.cgroup.mem_usage chart; there is no
# lookup, the value is computed directly as $ram * 100 / $memory_limit.
# NOTE(review): $ram is presumably the `ram` dimension of the chart and
# $memory_limit a chart variable set by the cgroups collector - confirm.
# Thresholds use hysteresis through $status:
#   WARNING  is raised above 90% and kept while the value stays above 80%
#   CRITICAL is raised above 98% and kept while the value stays above 90%
# delay: notifications for a status decrease are held back 15m; the delay is
# multiplied by 1.5 each time the status changes while a notification is
# pending, up to a maximum of 1h.
template: k8s_cgroup_ram_in_use
on: k8s.cgroup.mem_usage
class: Utilization
type: Cgroups
component: Memory
os: linux
hosts: *
calc: ($ram) * 100 / $memory_limit
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
info: cgroup memory utilization
to: sysadmin
# check for packet storms
# 1. calculate the rate at which packets are received over the last 1m: k8s_cgroup_1m_received_packets_rate
# 2. do the same for the last 10s
# 3. raise an alarm if the latter is more than 50x the former
#    (once raised, the alarm stays raised while it is more than 2x the former)
# the 1m rate is floored at 1000 packets/s, so the minimum packet storm
# must average more than 50000 packets/s over the last 10 seconds
# 1-minute average of the `received` dimension of k8s.cgroup.net_packets,
# recomputed every 10 seconds.
# This template defines no warn/crit expressions and no recipient, so it
# raises no alarm by itself; it only computes a value (step 1 of the packet
# storm check described in the notes of this file).
template: k8s_cgroup_1m_received_packets_rate
on: k8s.cgroup.net_packets
class: Workload
type: Cgroups
component: Network
hosts: *
lookup: average -1m unaligned of received
units: packets
every: 10s
info: average number of packets received by the network interface ${label:device} over the last minute
# Packet storm detector for Kubernetes container network interfaces.
# $this starts as the 10-second average of the `received` dimension and is
# then converted by calc into a percentage of the 1-minute baseline computed
# by the sibling template k8s_cgroup_1m_received_packets_rate on the same
# chart (the divisor must be that baseline, not this alarm's own value).
# The baseline is floored at 1000 so that a nearly idle interface does not
# produce a huge ratio from a small burst.
# WARNING is raised above 5000% and kept while the ratio stays above 200%.
# options no-clear-notification: no notification is sent when the alarm clears.
template: k8s_cgroup_10s_received_packets_storm
on: k8s.cgroup.net_packets
class: Workload
type: Cgroups
component: Network
hosts: *
lookup: average -10s unaligned of received
calc: $this * 100 / (($k8s_cgroup_1m_received_packets_rate < 1000)?(1000):($k8s_cgroup_1m_received_packets_rate))
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(200):(5000))
options: no-clear-notification
info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
compared to the rate over the last minute
to: sysadmin
|