From 1d63948d79ca6f32889656692d6736c9127f2ee1 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2019 19:57:47 +0200 Subject: Merging upstream version 1.14.0~rc0. Signed-off-by: Daniel Baumann --- health/health.d/kubelet.conf | 115 +++++++++++++++++++++++++++++++++++++++++ health/health.d/sslcheck.conf | 10 ---- health/health.d/x509check.conf | 10 ++++ 3 files changed, 125 insertions(+), 10 deletions(-) create mode 100644 health/health.d/kubelet.conf delete mode 100644 health/health.d/sslcheck.conf create mode 100644 health/health.d/x509check.conf (limited to 'health/health.d') diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf new file mode 100644 index 000000000..d2ef24b58 --- /dev/null +++ b/health/health.d/kubelet.conf @@ -0,0 +1,115 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# ----------------------------------------------------------------------------- + +# True (1) if the node is experiencing a configuration-related error, false (0) otherwise. + + template: node_config_error + on: k8s_kubelet.kubelet_node_config_error + calc: $kubelet_node_config_error + units: bool + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 2h + info: the node is experiencing a configuration-related error + to: sysadmin + +# Failed Token() requests to the alternate token source + + template: token_requests + lookup: sum -10s of token_fail_count + on: k8s_kubelet.kubelet_token_requests + units: failed requests + every: 10s + warn: $this > 0 + delay: down 1m multiplier 1.5 max 2h + info: failed token requests to alternate token source + to: sysadmin + +# Docker and runtime operation errors + + template: kubelet_operations_error + lookup: sum -1m + on: k8s_kubelet.kubelet_operations_errors + units: errors + every: 10s + warn: $this > (($status >= $WARNING) ? 
(0) : (20)) + delay: up 30s down 1m multiplier 1.5 max 2h + info: operations error + to: sysadmin + +# ----------------------------------------------------------------------------- + +# Pod Lifecycle Event Generator Relisting Latency + +# 1. calculate the pleg relisting latency for 1m (quantile 0.5, quantile 0.9, quantile 0.99) +# 2. do the same for the last 10s +# 3. raise an alarm if the latter is: +# - 2x the first for quantile 0.5 +# - 4x the first for quantile 0.9 +# - 8x the first for quantile 0.99 +# +# we assume the minimum latency is 1000 microseconds + +# quantile 0.5 + +template: 1m_kubelet_pleg_relist_latency_quantile_05 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + lookup: average -1m unaligned of kubelet_pleg_relist_latency_05 + units: microseconds + every: 10s + info: the average value of pleg relisting latency during the last minute (quantile 0.5) + +template: 10s_kubelet_pleg_relist_latency_quantile_05 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + lookup: average -10s unaligned of kubelet_pleg_relist_latency_05 + calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_05 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_05)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(100):(200)) + crit: $this > (($status >= $WARNING)?(200):(400)) + delay: down 1m multiplier 1.5 max 2h + info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.5) + to: sysadmin + +# quantile 0.9 + +template: 1m_kubelet_pleg_relist_latency_quantile_09 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + lookup: average -1m unaligned of kubelet_pleg_relist_latency_09 + units: microseconds + every: 10s + info: the average value of pleg relisting latency during the last minute (quantile 0.9) + +template: 10s_kubelet_pleg_relist_latency_quantile_09 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + lookup: average -10s unaligned of 
kubelet_pleg_relist_latency_09 + calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_09 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_09)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(200):(400)) + crit: $this > (($status >= $WARNING)?(400):(800)) + delay: down 1m multiplier 1.5 max 2h + info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.9) + to: sysadmin + +# quantile 0.99 + +template: 1m_kubelet_pleg_relist_latency_quantile_099 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + lookup: average -1m unaligned of kubelet_pleg_relist_latency_099 + units: microseconds + every: 10s + info: the average value of pleg relisting latency during the last minute (quantile 0.99) + +template: 10s_kubelet_pleg_relist_latency_quantile_099 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + lookup: average -10s unaligned of kubelet_pleg_relist_latency_099 + calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_099 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_099)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(400):(800)) + crit: $this > (($status >= $WARNING)?(800):(1200)) + delay: down 1m multiplier 1.5 max 2h + info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.99) + to: sysadmin diff --git a/health/health.d/sslcheck.conf b/health/health.d/sslcheck.conf deleted file mode 100644 index 29a017e3d..000000000 --- a/health/health.d/sslcheck.conf +++ /dev/null @@ -1,10 +0,0 @@ - -template: sslcheck_days_until_expiration - on: sslcheck.time_until_expiration - calc: $time - units: seconds - every: 60s - warn: $this < $days_until_expiration_warning*24*60*60 - crit: $this < $days_until_expiration_critical*24*60*60 - info: certificate time until expiration - to: webmaster diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf new file mode 100644 index 
000000000..dc0e6c695 --- /dev/null +++ b/health/health.d/x509check.conf @@ -0,0 +1,10 @@ + +template: x509check_days_until_expiration + on: x509check.time_until_expiration + calc: $expiry + units: seconds + every: 60s + warn: $this < $days_until_expiration_warning*24*60*60 + crit: $this < $days_until_expiration_critical*24*60*60 + info: certificate time until expiration + to: webmaster -- cgit v1.2.3