blob: 8adf5f7d47bc7d221798901fee8b5a21c91f1bef (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
|
# you can disable an alarm notification by setting the 'to' line to: silent
# -----------------------------------------------------------------------------
# True (1) if the node is experiencing a configuration-related error, false (0) otherwise.
template: kubelet_node_config_error
on: k8s_kubelet.kubelet_node_config_error
class: Errors
type: Kubernetes
component: Kubelet
calc: $experiencing_error
units: bool
every: 10s
warn: $this == 1
delay: down 1m multiplier 1.5 max 2h
summary: Kubelet node config error
info: The node is experiencing a configuration-related error (0: false, 1: true)
to: sysadmin
# Failed Token() requests to the alternate token source
template: kubelet_token_requests
on: k8s_kubelet.kubelet_token_requests
class: Errors
type: Kubernetes
component: Kubelet
lookup: sum -10s of failed
units: requests
every: 10s
warn: $this > 0
delay: down 1m multiplier 1.5 max 2h
summary: Kubelet failed token requests
info: Number of failed Token() requests to the alternate token source
to: sysadmin
# Docker and runtime operation errors
template: kubelet_operations_error
on: k8s_kubelet.kubelet_operations_errors
class: Errors
type: Kubernetes
component: Kubelet
lookup: sum -1m
units: errors
every: 10s
warn: $this > (($status >= $WARNING) ? (0) : (20))
delay: up 30s down 1m multiplier 1.5 max 2h
summary: Kubelet runtime errors
info: Number of Docker or runtime operation errors
to: sysadmin
# -----------------------------------------------------------------------------
# Pod Lifecycle Event Generator Relisting Latency
# 1. calculate the pleg relisting latency for 1m (quantile 0.5, quantile 0.9, quantile 0.99)
# 2. do the same for the last 10s
# 3. raise an alarm if the later is:
# - 2x the first for quantile 0.5
# - 4x the first for quantile 0.9
# - 8x the first for quantile 0.99
#
# we assume the minimum latency is 1000 microseconds
# quantile 0.5
template: kubelet_1m_pleg_relist_latency_quantile_05
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
class: Latency
type: Kubernetes
component: Kubelet
lookup: average -1m unaligned of 0.5
units: microseconds
every: 10s
info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5)
template: kubelet_10s_pleg_relist_latency_quantile_05
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
class: Latency
type: Kubernetes
component: Kubelet
lookup: average -10s unaligned of 0.5
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(100):(200))
crit: $this > (($status >= $WARNING)?(200):(400))
delay: down 1m multiplier 1.5 max 2h
summary: Kubelet relisting latency (quantile 0.5)
info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
compared to the last minute (quantile 0.5)
to: sysadmin
# quantile 0.9
template: kubelet_1m_pleg_relist_latency_quantile_09
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
class: Latency
type: Kubernetes
component: Kubelet
lookup: average -1m unaligned of 0.9
units: microseconds
every: 10s
info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9)
template: kubelet_10s_pleg_relist_latency_quantile_09
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
class: Latency
type: Kubernetes
component: Kubelet
lookup: average -10s unaligned of 0.9
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(200):(400))
crit: $this > (($status >= $WARNING)?(400):(800))
delay: down 1m multiplier 1.5 max 2h
summary: Kubelet relisting latency (quantile 0.9)
info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
compared to the last minute (quantile 0.9)
to: sysadmin
# quantile 0.99
template: kubelet_1m_pleg_relist_latency_quantile_099
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
class: Latency
type: Kubernetes
component: Kubelet
lookup: average -1m unaligned of 0.99
units: microseconds
every: 10s
info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99)
template: kubelet_10s_pleg_relist_latency_quantile_099
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
class: Latency
type: Kubernetes
component: Kubelet
lookup: average -10s unaligned of 0.99
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(400):(800))
crit: $this > (($status >= $WARNING)?(800):(1200))
delay: down 1m multiplier 1.5 max 2h
summary: Kubelet relisting latency (quantile 0.99)
info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
compared to the last minute (quantile 0.99)
to: sysadmin
|