summaryrefslogtreecommitdiffstats
path: root/health/health.d/kubelet.conf
blob: 8adf5f7d47bc7d221798901fee8b5a21c91f1bef (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# you can disable an alarm notification by setting the 'to' line to: silent

# -----------------------------------------------------------------------------

# Kubelet node configuration error.
#
# The alert value is the 'experiencing_error' dimension of the chart:
# true (1) if the node is experiencing a configuration-related error,
# false (0) otherwise. It is evaluated every 10s and raises a warning
# while the value is 1. No critical condition is defined.

 template: kubelet_node_config_error
       on: k8s_kubelet.kubelet_node_config_error
    class: Errors
     type: Kubernetes
component: Kubelet
     calc: $experiencing_error
    units: bool
    every: 10s
     warn: $this == 1
    delay: down 1m multiplier 1.5 max 2h
  summary: Kubelet node config error
     info: The node is experiencing a configuration-related error (0: false, 1: true)
       to: sysadmin

# Failed Token() requests to the alternate token source.
#
# Every 10s the 'failed' dimension is summed over the last 10 seconds; a
# warning is raised whenever that sum is above zero. No critical condition
# is defined.

 template: kubelet_token_requests
       on: k8s_kubelet.kubelet_token_requests
    class: Errors
     type: Kubernetes
component: Kubelet
   lookup: sum -10s of failed
    units: requests
    every: 10s
     warn: $this > 0
    delay: down 1m multiplier 1.5 max 2h
  summary: Kubelet failed token requests
     info: Number of failed Token() requests to the alternate token source
       to: sysadmin

# Docker and runtime operation errors.
#
# Every 10s the chart is summed over the last minute (the lookup names no
# specific dimension). The warning uses hysteresis: it is raised when the sum
# exceeds 20 and, once the alert is already in WARNING or above, it stays
# raised until the sum is no longer above 0. No critical condition is defined.

 template: kubelet_operations_error
       on: k8s_kubelet.kubelet_operations_errors
    class: Errors
     type: Kubernetes
component: Kubelet
   lookup: sum -1m
    units: errors
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (0) : (20))
    delay: up 30s down 1m multiplier 1.5 max 2h
  summary: Kubelet runtime errors
     info: Number of Docker or runtime operation errors
       to: sysadmin

# -----------------------------------------------------------------------------

# Pod Lifecycle Event Generator Relisting Latency

# 1. calculate the pleg relisting latency for 1m (quantile 0.5, quantile 0.9, quantile 0.99)
# 2. do the same for the last 10s
# 3. raise an alarm if the later is:
# - 2x the first for quantile 0.5
# - 4x the first for quantile 0.9
# - 8x the first for quantile 0.99
#
# we assume the minimum latency is 1000 microseconds

# quantile 0.5
#
# Baseline: average of the '0.5' dimension over the last minute, refreshed
# every 10s. It defines no warn/crit expressions and no recipient; it only
# provides the value that the 10s template below divides by.

 template: kubelet_1m_pleg_relist_latency_quantile_05
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
   lookup: average -1m unaligned of 0.5
    units: microseconds
    every: 10s
     info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5)

# Ratio, in percent, of the 10-second average to the 1-minute baseline above.
# The divisor is floored at 1000 microseconds so that a very small baseline
# cannot inflate the ratio.
#
# Thresholds (with hysteresis):
#   warning:  raised above 200%, held while above 100%
#   critical: raised above 400%, held while above 200%
#
# The critical threshold is relaxed only when the alert is already CRITICAL.
# Relaxing it on $status >= $WARNING would lower it to the warning trigger
# level as soon as a warning is raised, so any warning lasting a second
# evaluation would be promoted to critical without ever exceeding 400%.

 template: kubelet_10s_pleg_relist_latency_quantile_05
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
   lookup: average -10s unaligned of 0.5
     calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
    every: 10s
    units: %
     warn: $this > (($status >= $WARNING)?(100):(200))
     crit: $this > (($status == $CRITICAL)?(200):(400))
    delay: down 1m multiplier 1.5 max 2h
  summary: Kubelet relisting latency (quantile 0.5)
     info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
           compared to the last minute (quantile 0.5)
       to: sysadmin

# quantile 0.9
#
# Baseline: average of the '0.9' dimension over the last minute, refreshed
# every 10s. It defines no warn/crit expressions and no recipient; it only
# provides the value that the 10s template below divides by.

 template: kubelet_1m_pleg_relist_latency_quantile_09
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
   lookup: average -1m unaligned of 0.9
    units: microseconds
    every: 10s
     info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9)

# Ratio, in percent, of the 10-second average to the 1-minute baseline above.
# The divisor is floored at 1000 microseconds so that a very small baseline
# cannot inflate the ratio.
#
# Thresholds (with hysteresis):
#   warning:  raised above 400%, held while above 200%
#   critical: raised above 800%, held while above 400%
#
# The critical threshold is relaxed only when the alert is already CRITICAL.
# Relaxing it on $status >= $WARNING would lower it to the warning trigger
# level as soon as a warning is raised, so any warning lasting a second
# evaluation would be promoted to critical without ever exceeding 800%.

 template: kubelet_10s_pleg_relist_latency_quantile_09
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
   lookup: average -10s unaligned of 0.9
     calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
    every: 10s
    units: %
     warn: $this > (($status >= $WARNING)?(200):(400))
     crit: $this > (($status == $CRITICAL)?(400):(800))
    delay: down 1m multiplier 1.5 max 2h
  summary: Kubelet relisting latency (quantile 0.9)
     info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
           compared to the last minute (quantile 0.9)
       to: sysadmin

# quantile 0.99
#
# Baseline: average of the '0.99' dimension over the last minute, refreshed
# every 10s. It defines no warn/crit expressions and no recipient; it only
# provides the value that the 10s template below divides by.

 template: kubelet_1m_pleg_relist_latency_quantile_099
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
   lookup: average -1m unaligned of 0.99
    units: microseconds
    every: 10s
     info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99)

# Ratio, in percent, of the 10-second average to the 1-minute baseline above.
# The divisor is floored at 1000 microseconds so that a very small baseline
# cannot inflate the ratio.
#
# Thresholds (with hysteresis):
#   warning:  raised above 800%,  held while above 400%
#   critical: raised above 1200%, held while above 800%
#
# The critical threshold is relaxed only when the alert is already CRITICAL.
# Relaxing it on $status >= $WARNING would lower it to the warning trigger
# level as soon as a warning is raised, so any warning lasting a second
# evaluation would be promoted to critical without ever exceeding 1200%.

 template: kubelet_10s_pleg_relist_latency_quantile_099
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
    class: Latency
     type: Kubernetes
component: Kubelet
   lookup: average -10s unaligned of 0.99
     calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
    every: 10s
    units: %
     warn: $this > (($status >= $WARNING)?(400):(800))
     crit: $this > (($status == $CRITICAL)?(800):(1200))
    delay: down 1m multiplier 1.5 max 2h
  summary: Kubelet relisting latency (quantile 0.99)
     info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
           compared to the last minute (quantile 0.99)
       to: sysadmin