diff options
Diffstat (limited to 'health/health.d/kubelet.conf')
-rw-r--r-- | health/health.d/kubelet.conf | 151 |
1 files changed, 0 insertions, 151 deletions
diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf deleted file mode 100644 index 8adf5f7d..00000000 --- a/health/health.d/kubelet.conf +++ /dev/null @@ -1,151 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - -# ----------------------------------------------------------------------------- - -# True (1) if the node is experiencing a configuration-related error, false (0) otherwise. - - template: kubelet_node_config_error - on: k8s_kubelet.kubelet_node_config_error - class: Errors - type: Kubernetes -component: Kubelet - calc: $experiencing_error - units: bool - every: 10s - warn: $this == 1 - delay: down 1m multiplier 1.5 max 2h - summary: Kubelet node config error - info: The node is experiencing a configuration-related error (0: false, 1: true) - to: sysadmin - -# Failed Token() requests to the alternate token source - - template: kubelet_token_requests - on: k8s_kubelet.kubelet_token_requests - class: Errors - type: Kubernetes -component: Kubelet - lookup: sum -10s of failed - units: requests - every: 10s - warn: $this > 0 - delay: down 1m multiplier 1.5 max 2h - summary: Kubelet failed token requests - info: Number of failed Token() requests to the alternate token source - to: sysadmin - -# Docker and runtime operation errors - - template: kubelet_operations_error - on: k8s_kubelet.kubelet_operations_errors - class: Errors - type: Kubernetes -component: Kubelet - lookup: sum -1m - units: errors - every: 10s - warn: $this > (($status >= $WARNING) ? (0) : (20)) - delay: up 30s down 1m multiplier 1.5 max 2h - summary: Kubelet runtime errors - info: Number of Docker or runtime operation errors - to: sysadmin - -# ----------------------------------------------------------------------------- - -# Pod Lifecycle Event Generator Relisting Latency - -# 1. calculate the pleg relisting latency for 1m (quantile 0.5, quantile 0.9, quantile 0.99) -# 2. do the same for the last 10s -# 3. raise an alarm if the later is: -# - 2x the first for quantile 0.5 -# - 4x the first for quantile 0.9 -# - 8x the first for quantile 0.99 -# -# we assume the minimum latency is 1000 microseconds - -# quantile 0.5 - - template: kubelet_1m_pleg_relist_latency_quantile_05 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Latency - type: Kubernetes -component: Kubelet - lookup: average -1m unaligned of 0.5 - units: microseconds - every: 10s - info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5) - - template: kubelet_10s_pleg_relist_latency_quantile_05 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Latency - type: Kubernetes -component: Kubelet - lookup: average -10s unaligned of 0.5 - calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05)) - every: 10s - units: % - warn: $this > (($status >= $WARNING)?(100):(200)) - crit: $this > (($status >= $WARNING)?(200):(400)) - delay: down 1m multiplier 1.5 max 2h - summary: Kubelet relisting latency (quantile 0.5) - info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ - compared to the last minute (quantile 0.5) - to: sysadmin - -# quantile 0.9 - - template: kubelet_1m_pleg_relist_latency_quantile_09 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Latency - type: Kubernetes -component: Kubelet - lookup: average -1m unaligned of 0.9 - units: microseconds - every: 10s - info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9) - - template: kubelet_10s_pleg_relist_latency_quantile_09 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Latency - type: Kubernetes -component: Kubelet - lookup: average -10s unaligned of 0.9 - calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09)) - every: 10s - units: % - warn: $this > (($status >= $WARNING)?(200):(400)) - crit: $this > (($status >= $WARNING)?(400):(800)) - delay: down 1m multiplier 1.5 max 2h - summary: Kubelet relisting latency (quantile 0.9) - info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ - compared to the last minute (quantile 0.9) - to: sysadmin - -# quantile 0.99 - - template: kubelet_1m_pleg_relist_latency_quantile_099 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Latency - type: Kubernetes -component: Kubelet - lookup: average -1m unaligned of 0.99 - units: microseconds - every: 10s - info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99) - - template: kubelet_10s_pleg_relist_latency_quantile_099 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Latency - type: Kubernetes -component: Kubelet - lookup: average -10s unaligned of 0.99 - calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099)) - every: 10s - units: % - warn: $this > (($status >= $WARNING)?(400):(800)) - crit: $this > (($status >= $WARNING)?(800):(1200)) - delay: down 1m multiplier 1.5 max 2h - summary: Kubelet relisting latency (quantile 0.99) - info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ - compared to the last minute (quantile 0.99) - to: sysadmin |