# source: health/health.d/kubelet.conf (blob c2778cc5eeed9591dc783a1facbd7bea7625c6c8)
# you can disable an alarm notification by setting the 'to' line to: silent

# -----------------------------------------------------------------------------

# True (1) if the node is experiencing a configuration-related error, false (0) otherwise.

 template: kubelet_node_config_error
# Warns while the kubelet reports a configuration-related error on the node.
       on: k8s_kubelet.kubelet_node_config_error
    class: Errors
     type: Kubernetes
component: Kubelet
# There is no lookup line: the alarm value ($this) is taken directly from the
# $kubelet_node_config_error variable (presumably a dimension of the chart
# named on the `on` line - confirm against the collector).
     calc: $kubelet_node_config_error
    units: bool
    every: 10s
# The value is a 0/1 flag, so the warning is raised as soon as it reads 1.
# No crit line is defined for this alarm.
     warn: $this == 1
    delay: down 1m multiplier 1.5 max 2h
     info: the node is experiencing a configuration-related error (0: false, 1: true)
# Notification recipient; set to `silent` to disable notifications.
       to: sysadmin

# Failed Token() requests to the alternate token source

 template: kubelet_token_requests
# Warns when Token() requests to the alternate token source have failed
# within the last 10 seconds.
# The alarm value ($this) is the sum of the token_fail_count dimension over
# the last 10 seconds.
   lookup: sum -10s of token_fail_count
       on: k8s_kubelet.kubelet_token_requests
    class: Errors
     type: Kubernetes
component: Kubelet
    units: failed requests
    every: 10s
# Any non-zero sum raises the warning. No crit line is defined for this alarm.
     warn: $this > 0
    delay: down 1m multiplier 1.5 max 2h
     info: number of failed Token() requests to the alternate token source
# Notification recipient; set to `silent` to disable notifications.
       to: sysadmin

# Docker and runtime operation errors

 template: kubelet_operations_error
# Warns on a burst of Docker or runtime operation errors over the last minute.
# The lookup has no `of` clause, so the sum is not restricted to named
# dimensions (presumably it covers every dimension of the chart - confirm
# against the Netdata lookup reference).
   lookup: sum -1m
       on: k8s_kubelet.kubelet_operations_errors
    class: Errors
     type: Kubernetes
component: Kubelet
    units: errors
    every: 10s
# Hysteresis: the warning is raised when the sum exceeds 20; once the alarm is
# already in WARNING (or worse) it stays raised for as long as the sum is
# above 0. No crit line is defined for this alarm.
     warn: $this > (($status >= $WARNING)  ? (0) : (20))
# Unlike the other alarms in this file, the delay line also has an `up 30s` part.
    delay: up 30s down 1m multiplier 1.5 max 2h
     info: number of Docker or runtime operation errors
# Notification recipient; set to `silent` to disable notifications.
       to: sysadmin

# -----------------------------------------------------------------------------

# Pod Lifecycle Event Generator Relisting Latency

# 1. calculate the pleg relisting latency for 1m (quantile 0.5, quantile 0.9, quantile 0.99)
# 2. do the same for the last 10s
# 3. raise an alarm if the latter is:
# - 2x the first for quantile 0.5
# - 4x the first for quantile 0.9
# - 8x the first for quantile 0.99
#
# we assume the minimum latency is 1000 microseconds

# quantile 0.5

 template: kubelet_1m_pleg_relist_latency_quantile_05
# Baseline value for kubelet_10s_pleg_relist_latency_quantile_05, whose calc
# line reads this alarm's value by name. This stanza defines no warn, crit or
# to line; it only computes a value.
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
# One-minute average of the quantile 0.5 dimension.
   lookup: average -1m unaligned of kubelet_pleg_relist_latency_05
    units: microseconds
    every: 10s
     info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5)

 template: kubelet_10s_pleg_relist_latency_quantile_05
# Compares the 10-second average PLEG relisting latency (quantile 0.5) with
# the 1-minute average held by kubelet_1m_pleg_relist_latency_quantile_05 and
# alarms when the short-term latency is a multiple of that baseline.
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
   lookup: average -10s unaligned of kubelet_pleg_relist_latency_05
# Express the 10s average as a percentage of the 1m average. The divisor is
# floored at 1000 microseconds so a very small baseline cannot inflate the ratio.
     calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
    every: 10s
    units: %
# Warning hysteresis: raise above 200% (2x the baseline), keep raised while
# the ratio is still above 100%.
     warn: $this > (($status >= $WARNING)?(100):(200))
# Critical hysteresis is keyed on the CRITICAL state only: raise above 400%,
# keep raised while the ratio is still above 200%. Keying it on WARNING would
# lower the critical threshold to 200% the moment the warning fires, so every
# sustained warning would escalate to critical on the next evaluation.
     crit: $this > (($status == $CRITICAL)?(200):(400))
    delay: down 1m multiplier 1.5 max 2h
     info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
           compared to the last minute (quantile 0.5)
# Notification recipient; set to `silent` to disable notifications.
       to: sysadmin

# quantile 0.9

 template: kubelet_1m_pleg_relist_latency_quantile_09
# Baseline value for kubelet_10s_pleg_relist_latency_quantile_09, whose calc
# line reads this alarm's value by name. This stanza defines no warn, crit or
# to line; it only computes a value.
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
# One-minute average of the quantile 0.9 dimension.
   lookup: average -1m unaligned of kubelet_pleg_relist_latency_09
    units: microseconds
    every: 10s
     info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9)

 template: kubelet_10s_pleg_relist_latency_quantile_09
# Compares the 10-second average PLEG relisting latency (quantile 0.9) with
# the 1-minute average held by kubelet_1m_pleg_relist_latency_quantile_09 and
# alarms when the short-term latency is a multiple of that baseline.
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
   lookup: average -10s unaligned of kubelet_pleg_relist_latency_09
# Express the 10s average as a percentage of the 1m average. The divisor is
# floored at 1000 microseconds so a very small baseline cannot inflate the ratio.
     calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
    every: 10s
    units: %
# Warning hysteresis: raise above 400% (4x the baseline), keep raised while
# the ratio is still above 200%.
     warn: $this > (($status >= $WARNING)?(200):(400))
# Critical hysteresis is keyed on the CRITICAL state only: raise above 800%,
# keep raised while the ratio is still above 400%. Keying it on WARNING would
# lower the critical threshold to 400% the moment the warning fires, so every
# sustained warning would escalate to critical on the next evaluation.
     crit: $this > (($status == $CRITICAL)?(400):(800))
    delay: down 1m multiplier 1.5 max 2h
     info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
           compared to the last minute (quantile 0.9)
# Notification recipient; set to `silent` to disable notifications.
       to: sysadmin

# quantile 0.99

 template: kubelet_1m_pleg_relist_latency_quantile_099
# Baseline value for kubelet_10s_pleg_relist_latency_quantile_099, whose calc
# line reads this alarm's value by name. This stanza defines no warn, crit or
# to line; it only computes a value.
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
# One-minute average of the quantile 0.99 dimension.
   lookup: average -1m unaligned of kubelet_pleg_relist_latency_099
    units: microseconds
    every: 10s
     info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99)

 template: kubelet_10s_pleg_relist_latency_quantile_099
# Compares the 10-second average PLEG relisting latency (quantile 0.99) with
# the 1-minute average held by kubelet_1m_pleg_relist_latency_quantile_099 and
# alarms when the short-term latency is a multiple of that baseline.
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
   lookup: average -10s unaligned of kubelet_pleg_relist_latency_099
# Express the 10s average as a percentage of the 1m average. The divisor is
# floored at 1000 microseconds so a very small baseline cannot inflate the ratio.
     calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
    every: 10s
    units: %
# Warning hysteresis: raise above 800% (8x the baseline), keep raised while
# the ratio is still above 400%.
     warn: $this > (($status >= $WARNING)?(400):(800))
# Critical hysteresis is keyed on the CRITICAL state only: raise above 1200%,
# keep raised while the ratio is still above 800%. Keying it on WARNING would
# lower the critical threshold to 800% the moment the warning fires, so every
# sustained warning would escalate to critical on the next evaluation.
     crit: $this > (($status == $CRITICAL)?(800):(1200))
    delay: down 1m multiplier 1.5 max 2h
     info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
           compared to the last minute (quantile 0.99)
# Notification recipient; set to `silent` to disable notifications.
       to: sysadmin