blob: 4d3c45f9710e0891589d506cb0f6834167816b93 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
|
# you can disable an alarm notification by setting the 'to' line to: silent
# -----------------------------------------------------------------------------
# True (1) if the node is experiencing a configuration-related error, false (0) otherwise.
template: kubelet_node_config_error
on: k8s_kubelet.kubelet_node_config_error
class: Kubernetes
component: Kubelet
type: Errors
calc: $kubelet_node_config_error
units: bool
every: 10s
warn: $this == 1
delay: down 1m multiplier 1.5 max 2h
info: the node is experiencing a configuration-related error (0: false, 1: true)
to: sysadmin
# Failed Token() requests to the alternate token source
template: kubelet_token_requests
lookup: sum -10s of token_fail_count
on: k8s_kubelet.kubelet_token_requests
class: Kubernetes
component: Kubelet
type: Errors
units: failed requests
every: 10s
warn: $this > 0
delay: down 1m multiplier 1.5 max 2h
info: number of failed Token() requests to the alternate token source
to: sysadmin
# Docker and runtime operation errors
template: kubelet_operations_error
lookup: sum -1m
on: k8s_kubelet.kubelet_operations_errors
class: Kubernetes
component: Kubelet
type: Errors
units: errors
every: 10s
warn: $this > (($status >= $WARNING) ? (0) : (20))
delay: up 30s down 1m multiplier 1.5 max 2h
info: number of Docker or runtime operation errors
to: sysadmin
# -----------------------------------------------------------------------------
# Pod Lifecycle Event Generator Relisting Latency
# 1. calculate the pleg relisting latency for 1m (quantile 0.5, quantile 0.9, quantile 0.99)
# 2. do the same for the last 10s
# 3. raise an alarm if the later is:
# - 2x the first for quantile 0.5
# - 4x the first for quantile 0.9
# - 8x the first for quantile 0.99
#
# we assume the minimum latency is 1000 microseconds
# quantile 0.5
template: kubelet_1m_pleg_relist_latency_quantile_05
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
class: Kubernetes
component: Kubelet
type: Latency
lookup: average -1m unaligned of kubelet_pleg_relist_latency_05
units: microseconds
every: 10s
info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5)
template: kubelet_10s_pleg_relist_latency_quantile_05
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
class: Kubernetes
component: Kubelet
type: Latency
lookup: average -10s unaligned of kubelet_pleg_relist_latency_05
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(100):(200))
crit: $this > (($status >= $WARNING)?(200):(400))
delay: down 1m multiplier 1.5 max 2h
info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
compared to the last minute (quantile 0.5)
to: sysadmin
# quantile 0.9
template: kubelet_1m_pleg_relist_latency_quantile_09
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
class: Kubernetes
component: Kubelet
type: Latency
lookup: average -1m unaligned of kubelet_pleg_relist_latency_09
units: microseconds
every: 10s
info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9)
template: kubelet_10s_pleg_relist_latency_quantile_09
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
class: Kubernetes
component: Kubelet
type: Latency
lookup: average -10s unaligned of kubelet_pleg_relist_latency_09
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(200):(400))
crit: $this > (($status >= $WARNING)?(400):(800))
delay: down 1m multiplier 1.5 max 2h
info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
compared to the last minute (quantile 0.9)
to: sysadmin
# quantile 0.99
template: kubelet_1m_pleg_relist_latency_quantile_099
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
class: Kubernetes
component: Kubelet
type: Latency
lookup: average -1m unaligned of kubelet_pleg_relist_latency_099
units: microseconds
every: 10s
info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99)
template: kubelet_10s_pleg_relist_latency_quantile_099
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
class: Kubernetes
component: Kubelet
type: Latency
lookup: average -10s unaligned of kubelet_pleg_relist_latency_099
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(400):(800))
crit: $this > (($status >= $WARNING)?(800):(1200))
delay: down 1m multiplier 1.5 max 2h
info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
compared to the last minute (quantile 0.99)
to: sysadmin
|