diff options
Diffstat (limited to 'health/health.d/kubelet.conf')
-rw-r--r-- | health/health.d/kubelet.conf | 195 |
1 file changed, 111 insertions, 84 deletions
diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf index 5eda59b2..4d3c45f9 100644 --- a/health/health.d/kubelet.conf +++ b/health/health.d/kubelet.conf @@ -4,39 +4,48 @@ # True (1) if the node is experiencing a configuration-related error, false (0) otherwise. - template: kubelet_node_config_error - on: k8s_kubelet.kubelet_node_config_error - calc: $kubelet_node_config_error - units: bool - every: 10s - warn: $this == 1 - delay: down 1m multiplier 1.5 max 2h - info: the node is experiencing a configuration-related error (0: false, 1: true) - to: sysadmin + template: kubelet_node_config_error + on: k8s_kubelet.kubelet_node_config_error + class: Kubernetes +component: Kubelet + type: Errors + calc: $kubelet_node_config_error + units: bool + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 2h + info: the node is experiencing a configuration-related error (0: false, 1: true) + to: sysadmin # Failed Token() requests to the alternate token source - template: kubelet_token_requests - lookup: sum -10s of token_fail_count - on: k8s_kubelet.kubelet_token_requests - units: failed requests - every: 10s - warn: $this > 0 - delay: down 1m multiplier 1.5 max 2h - info: number of failed Token() requests to the alternate token source - to: sysadmin + template: kubelet_token_requests + lookup: sum -10s of token_fail_count + on: k8s_kubelet.kubelet_token_requests + class: Kubernetes +component: Kubelet + type: Errors + units: failed requests + every: 10s + warn: $this > 0 + delay: down 1m multiplier 1.5 max 2h + info: number of failed Token() requests to the alternate token source + to: sysadmin # Docker and runtime operation errors - template: kubelet_operations_error - lookup: sum -1m - on: k8s_kubelet.kubelet_operations_errors - units: errors - every: 10s - warn: $this > (($status >= $WARNING) ? 
(0) : (20)) - delay: up 30s down 1m multiplier 1.5 max 2h - info: number of Docker or runtime operation errors - to: sysadmin + template: kubelet_operations_error + lookup: sum -1m + on: k8s_kubelet.kubelet_operations_errors + class: Kubernetes +component: Kubelet + type: Errors + units: errors + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (20)) + delay: up 30s down 1m multiplier 1.5 max 2h + info: number of Docker or runtime operation errors + to: sysadmin # ----------------------------------------------------------------------------- @@ -53,66 +62,84 @@ # quantile 0.5 -template: kubelet_1m_pleg_relist_latency_quantile_05 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - lookup: average -1m unaligned of kubelet_pleg_relist_latency_05 - units: microseconds - every: 10s - info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5) - -template: kubelet_10s_pleg_relist_latency_quantile_05 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - lookup: average -10s unaligned of kubelet_pleg_relist_latency_05 - calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05)) - every: 10s - units: % - warn: $this > (($status >= $WARNING)?(100):(200)) - crit: $this > (($status >= $WARNING)?(200):(400)) - delay: down 1m multiplier 1.5 max 2h - info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ - compared to the last minute (quantile 0.5) - to: sysadmin + template: kubelet_1m_pleg_relist_latency_quantile_05 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Kubernetes +component: Kubelet + type: Latency + lookup: average -1m unaligned of kubelet_pleg_relist_latency_05 + units: microseconds + every: 10s + info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5) + + template: kubelet_10s_pleg_relist_latency_quantile_05 + on: 
k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Kubernetes +component: Kubelet + type: Latency + lookup: average -10s unaligned of kubelet_pleg_relist_latency_05 + calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(100):(200)) + crit: $this > (($status >= $WARNING)?(200):(400)) + delay: down 1m multiplier 1.5 max 2h + info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ + compared to the last minute (quantile 0.5) + to: sysadmin # quantile 0.9 -template: kubelet_1m_pleg_relist_latency_quantile_09 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - lookup: average -1m unaligned of kubelet_pleg_relist_latency_09 - units: microseconds - every: 10s - info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9) - -template: kubelet_10s_pleg_relist_latency_quantile_09 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - lookup: average -10s unaligned of kubelet_pleg_relist_latency_09 - calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09)) - every: 10s - units: % - warn: $this > (($status >= $WARNING)?(200):(400)) - crit: $this > (($status >= $WARNING)?(400):(800)) - delay: down 1m multiplier 1.5 max 2h - info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ - compared to the last minute (quantile 0.9) - to: sysadmin + template: kubelet_1m_pleg_relist_latency_quantile_09 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Kubernetes +component: Kubelet + type: Latency + lookup: average -1m unaligned of kubelet_pleg_relist_latency_09 + units: microseconds + every: 10s + info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9) + + template: 
kubelet_10s_pleg_relist_latency_quantile_09 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Kubernetes +component: Kubelet + type: Latency + lookup: average -10s unaligned of kubelet_pleg_relist_latency_09 + calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(200):(400)) + crit: $this > (($status >= $WARNING)?(400):(800)) + delay: down 1m multiplier 1.5 max 2h + info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ + compared to the last minute (quantile 0.9) + to: sysadmin # quantile 0.99 -template: kubelet_1m_pleg_relist_latency_quantile_099 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - lookup: average -1m unaligned of kubelet_pleg_relist_latency_099 - units: microseconds - every: 10s - info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99) - -template: kubelet_10s_pleg_relist_latency_quantile_099 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - lookup: average -10s unaligned of kubelet_pleg_relist_latency_099 - calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099)) - every: 10s - units: % - warn: $this > (($status >= $WARNING)?(400):(800)) - crit: $this > (($status >= $WARNING)?(800):(1200)) - delay: down 1m multiplier 1.5 max 2h - info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ - compared to the last minute (quantile 0.99) - to: sysadmin + template: kubelet_1m_pleg_relist_latency_quantile_099 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Kubernetes +component: Kubelet + type: Latency + lookup: average -1m unaligned of kubelet_pleg_relist_latency_099 + units: microseconds + every: 10s + info: average Pod Lifecycle Event Generator relisting 
latency over the last minute (quantile 0.99) + + template: kubelet_10s_pleg_relist_latency_quantile_099 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Kubernetes +component: Kubelet + type: Latency + lookup: average -10s unaligned of kubelet_pleg_relist_latency_099 + calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(400):(800)) + crit: $this > (($status >= $WARNING)?(800):(1200)) + delay: down 1m multiplier 1.5 max 2h + info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ + compared to the last minute (quantile 0.99) + to: sysadmin |