diff options
Diffstat (limited to '')
-rw-r--r-- | health/health.d/kubelet.conf | 151 |
1 files changed, 151 insertions, 0 deletions
diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf new file mode 100644 index 00000000..8adf5f7d --- /dev/null +++ b/health/health.d/kubelet.conf @@ -0,0 +1,151 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# ----------------------------------------------------------------------------- + +# True (1) if the node is experiencing a configuration-related error, false (0) otherwise. + + template: kubelet_node_config_error + on: k8s_kubelet.kubelet_node_config_error + class: Errors + type: Kubernetes +component: Kubelet + calc: $experiencing_error + units: bool + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 2h + summary: Kubelet node config error + info: The node is experiencing a configuration-related error (0: false, 1: true) + to: sysadmin + +# Failed Token() requests to the alternate token source + + template: kubelet_token_requests + on: k8s_kubelet.kubelet_token_requests + class: Errors + type: Kubernetes +component: Kubelet + lookup: sum -10s of failed + units: requests + every: 10s + warn: $this > 0 + delay: down 1m multiplier 1.5 max 2h + summary: Kubelet failed token requests + info: Number of failed Token() requests to the alternate token source + to: sysadmin + +# Docker and runtime operation errors + + template: kubelet_operations_error + on: k8s_kubelet.kubelet_operations_errors + class: Errors + type: Kubernetes +component: Kubelet + lookup: sum -1m + units: errors + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (20)) + delay: up 30s down 1m multiplier 1.5 max 2h + summary: Kubelet runtime errors + info: Number of Docker or runtime operation errors + to: sysadmin + +# ----------------------------------------------------------------------------- + +# Pod Lifecycle Event Generator Relisting Latency + +# 1. calculate the pleg relisting latency for 1m (quantile 0.5, quantile 0.9, quantile 0.99) +# 2. do the same for the last 10s +# 3. raise an alarm if the later is: +# - 2x the first for quantile 0.5 +# - 4x the first for quantile 0.9 +# - 8x the first for quantile 0.99 +# +# we assume the minimum latency is 1000 microseconds + +# quantile 0.5 + + template: kubelet_1m_pleg_relist_latency_quantile_05 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Latency + type: Kubernetes +component: Kubelet + lookup: average -1m unaligned of 0.5 + units: microseconds + every: 10s + info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5) + + template: kubelet_10s_pleg_relist_latency_quantile_05 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Latency + type: Kubernetes +component: Kubelet + lookup: average -10s unaligned of 0.5 + calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(100):(200)) + crit: $this > (($status >= $WARNING)?(200):(400)) + delay: down 1m multiplier 1.5 max 2h + summary: Kubelet relisting latency (quantile 0.5) + info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ + compared to the last minute (quantile 0.5) + to: sysadmin + +# quantile 0.9 + + template: kubelet_1m_pleg_relist_latency_quantile_09 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Latency + type: Kubernetes +component: Kubelet + lookup: average -1m unaligned of 0.9 + units: microseconds + every: 10s + info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9) + + template: kubelet_10s_pleg_relist_latency_quantile_09 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Latency + type: Kubernetes +component: Kubelet + lookup: average -10s unaligned of 0.9 + calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(200):(400)) + crit: $this > (($status >= $WARNING)?(400):(800)) + delay: down 1m multiplier 1.5 max 2h + summary: Kubelet relisting latency (quantile 0.9) + info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ + compared to the last minute (quantile 0.9) + to: sysadmin + +# quantile 0.99 + + template: kubelet_1m_pleg_relist_latency_quantile_099 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Latency + type: Kubernetes +component: Kubelet + lookup: average -1m unaligned of 0.99 + units: microseconds + every: 10s + info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99) + + template: kubelet_10s_pleg_relist_latency_quantile_099 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Latency + type: Kubernetes +component: Kubelet + lookup: average -10s unaligned of 0.99 + calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(400):(800)) + crit: $this > (($status >= $WARNING)?(800):(1200)) + delay: down 1m multiplier 1.5 max 2h + summary: Kubelet relisting latency (quantile 0.99) + info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ + compared to the last minute (quantile 0.99) + to: sysadmin |