blob: 53a6ea00ff714cbea1b8d463425fe98f7602fdeb (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
|
# you can disable an alarm notification by setting the 'to' line to: silent
# Warns when a cgroup's CPU utilization, averaged over the last 10 minutes,
# is too high. Attached to the cgroup.cpu_limit context, evaluated every minute.
template: cgroup_10min_cpu_usage
on: cgroup.cpu_limit
class: Utilization
type: Cgroups
component: CPU
os: linux
hosts: *
lookup: average -10m unaligned
units: %
every: 1m
# Hysteresis: WARNING is raised when the value exceeds 95 and, once raised,
# is kept until the value is no longer above 85. The condition has to test
# for WARNING: this template has no 'crit' line, so the status can never be
# CRITICAL and a '$status == $CRITICAL' test would leave the 85 branch dead.
warn: $this > (($status >= $WARNING) ? (85) : (95))
# Notifications for a status decrease are delayed by 15m; the delay is
# multiplied by 1.5 on repeated changes, capped at 1h.
delay: down 15m multiplier 1.5 max 1h
info: average cgroup CPU utilization over the last 10 minutes
# 'silent' disables notifications for this alarm.
to: silent
# Tracks cgroup memory utilization as a percentage of the cgroup's memory
# limit. Attached to the cgroup.mem_usage context, evaluated every 10 seconds.
template: cgroup_ram_in_use
on: cgroup.mem_usage
class: Utilization
type: Cgroups
component: Memory
os: linux
hosts: *
# $ram expressed as a percentage of $memory_limit.
calc: ($ram) * 100 / $memory_limit
units: %
every: 10s
# Hysteresis: WARNING is raised above 90 and kept until the value is no longer above 80.
warn: $this > (($status >= $WARNING) ? (80) : (90))
# Hysteresis: CRITICAL is raised above 98 and kept until the value is no longer above 90.
crit: $this > (($status == $CRITICAL) ? (90) : (98))
# Notifications for a status decrease are delayed by 15m; the delay is
# multiplied by 1.5 on repeated changes, capped at 1h.
delay: down 15m multiplier 1.5 max 1h
info: cgroup memory utilization
# 'silent' disables notifications for this alarm.
to: silent
# FIXME COMMENTED DUE TO A BUG IN NETDATA
## -----------------------------------------------------------------------------
## check for packet storms
#
## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
## 2. do the same for the last 10s
## 3. raise an alarm if the latter is 10x or 20x the first
## we assume the minimum packet storm should at least have
## 10000 packets/s, average of the last 10 seconds
#
# template: cgroup_1m_received_packets_rate
# on: cgroup.net_packets
# class: Workload
# type: Cgroups
#component: Network
# hosts: *
# lookup: average -1m unaligned of received
# units: packets
# every: 10s
# info: average number of packets received by the network interface ${label:device} over the last minute
#
# template: cgroup_10s_received_packets_storm
# on: cgroup.net_packets
# class: Workload
# type: Cgroups
#component: Network
# hosts: *
# lookup: average -10s unaligned of received
# calc: $this * 100 / (($cgroup_1m_received_packets_rate < 1000)?(1000):($cgroup_1m_received_packets_rate))
# every: 10s
# units: %
# warn: $this > (($status >= $WARNING)?(200):(5000))
# options: no-clear-notification
# info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
# compared to the rate over the last minute
# to: sysadmin
#
# ---------------------------------K8s containers--------------------------------------------
# Warns when a Kubernetes container's CPU utilization, averaged over the last
# 10 minutes, is too high. Attached to the k8s.cgroup.cpu_limit context,
# evaluated every minute.
template: k8s_cgroup_10min_cpu_usage
on: k8s.cgroup.cpu_limit
class: Utilization
type: Cgroups
component: CPU
os: linux
hosts: *
lookup: average -10m unaligned
units: %
every: 1m
# Hysteresis: WARNING is raised above 85 and kept until the value is no longer above 75.
warn: $this > (($status >= $WARNING) ? (75) : (85))
# Notifications for a status decrease are delayed by 15m; the delay is
# multiplied by 1.5 on repeated changes, capped at 1h.
delay: down 15m multiplier 1.5 max 1h
# The info text spans two lines via the trailing backslash; the ${label:...}
# placeholders name the container, pod and namespace.
info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
average CPU utilization over the last 10 minutes
# 'silent' disables notifications for this alarm.
to: silent
# Tracks a Kubernetes container's memory utilization as a percentage of its
# memory limit. Attached to the k8s.cgroup.mem_usage context, evaluated every
# 10 seconds.
template: k8s_cgroup_ram_in_use
on: k8s.cgroup.mem_usage
class: Utilization
type: Cgroups
component: Memory
os: linux
hosts: *
# $ram expressed as a percentage of $memory_limit.
calc: ($ram) * 100 / $memory_limit
units: %
every: 10s
# Hysteresis: WARNING is raised above 90 and kept until the value is no longer above 80.
warn: $this > (($status >= $WARNING) ? (80) : (90))
# Hysteresis: CRITICAL is raised above 98 and kept until the value is no longer above 90.
crit: $this > (($status == $CRITICAL) ? (90) : (98))
# Notifications for a status decrease are delayed by 15m; the delay is
# multiplied by 1.5 on repeated changes, capped at 1h.
delay: down 15m multiplier 1.5 max 1h
# The info text spans two lines via the trailing backslash; the ${label:...}
# placeholders name the container, pod and namespace.
info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
memory utilization
# 'silent' disables notifications for this alarm.
to: silent
# check for packet storms
# FIXME COMMENTED DUE TO A BUG IN NETDATA
## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
## 2. do the same for the last 10s
## 3. raise an alarm if the latter is 10x or 20x the first
## we assume the minimum packet storm should at least have
## 10000 packets/s, average of the last 10 seconds
#
# template: k8s_cgroup_1m_received_packets_rate
# on: k8s.cgroup.net_packets
# class: Workload
# type: Cgroups
#component: Network
# hosts: *
# lookup: average -1m unaligned of received
# units: packets
# every: 10s
# info: average number of packets received by the network interface ${label:device} over the last minute
#
# template: k8s_cgroup_10s_received_packets_storm
# on: k8s.cgroup.net_packets
# class: Workload
# type: Cgroups
#component: Network
# hosts: *
# lookup: average -10s unaligned of received
# calc: $this * 100 / (($k8s_cgroup_1m_received_packets_rate < 1000)?(1000):($k8s_cgroup_1m_received_packets_rate))
# every: 10s
# units: %
# warn: $this > (($status >= $WARNING)?(200):(5000))
# options: no-clear-notification
# info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
# compared to the rate over the last minute
# to: sysadmin
|