diff options
Diffstat (limited to '')
-rw-r--r-- | health/health.d/cgroups.conf | 8 | ||||
-rw-r--r-- | health/health.d/consul.conf | 159 | ||||
-rw-r--r-- | health/health.d/disks.conf | 8 | ||||
-rw-r--r-- | health/health.d/dns_query.conf | 2 | ||||
-rw-r--r-- | health/health.d/elasticsearch.conf | 73 | ||||
-rw-r--r-- | health/health.d/fping.conf | 64 | ||||
-rw-r--r-- | health/health.d/httpcheck.conf | 62 | ||||
-rw-r--r-- | health/health.d/kubelet.conf | 20 | ||||
-rw-r--r-- | health/health.d/load.conf | 5 | ||||
-rw-r--r-- | health/health.d/mdstat.conf | 4 | ||||
-rw-r--r-- | health/health.d/net.conf | 28 | ||||
-rw-r--r-- | health/health.d/nvme.conf | 2 | ||||
-rw-r--r-- | health/health.d/ping.conf | 6 | ||||
-rw-r--r-- | health/health.d/portcheck.conf | 6 | ||||
-rw-r--r-- | health/health.d/postgres.conf | 26 | ||||
-rw-r--r-- | health/health.d/zfs.conf | 4 |
16 files changed, 305 insertions, 172 deletions
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf index 4bfe38b65..08260ff6d 100644 --- a/health/health.d/cgroups.conf +++ b/health/health.d/cgroups.conf @@ -51,7 +51,7 @@ component: Network lookup: average -1m unaligned of received units: packets every: 10s - info: average number of packets received by the network interface $family over the last minute + info: average number of packets received by the network interface ${label:device} over the last minute template: cgroup_10s_received_packets_storm on: cgroup.net_packets @@ -66,7 +66,7 @@ component: Network warn: $this > (($status >= $WARNING)?(200):(5000)) crit: $this > (($status == $CRITICAL)?(5000):(6000)) options: no-clear-notification - info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \ + info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ compared to the rate over the last minute to: sysadmin @@ -121,7 +121,7 @@ component: Network lookup: average -1m unaligned of received units: packets every: 10s - info: average number of packets received by the network interface $family over the last minute + info: average number of packets received by the network interface ${label:device} over the last minute template: k8s_cgroup_10s_received_packets_storm on: k8s.cgroup.net_packets @@ -136,6 +136,6 @@ component: Network warn: $this > (($status >= $WARNING)?(200):(5000)) crit: $this > (($status == $CRITICAL)?(5000):(6000)) options: no-clear-notification - info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \ + info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ compared to the rate over the last minute to: sysadmin diff --git a/health/health.d/consul.conf b/health/health.d/consul.conf new file mode 100644 index 000000000..dff6d2df3 --- /dev/null +++ 
b/health/health.d/consul.conf @@ -0,0 +1,159 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: consul_license_expiration_time + on: consul.license_expiration_time + class: Errors + type: ServiceMesh +component: Consul + calc: $license_expiration + every: 60m + units: seconds + warn: $this < 14*24*60*60 + crit: $this < 7*24*60*60 + info: Consul Enterprise licence expiration time on node ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_autopilot_health_status + on: consul.autopilot_health_status + class: Errors + type: ServiceMesh +component: Consul + calc: $unhealthy + every: 10s + units: status + warn: $this == 1 + delay: down 5m multiplier 1.5 max 1h + info: datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name} + to: sysadmin + + template: consul_autopilot_server_health_status + on: consul.autopilot_server_health_status + class: Errors + type: ServiceMesh +component: Consul + calc: $unhealthy + every: 10s + units: status + warn: $this == 1 + delay: down 5m multiplier 1.5 max 1h + info: server ${label:node_name} from datacenter ${label:datacenter} is unhealthy + to: sysadmin + + template: consul_raft_leader_last_contact_time + on: consul.raft_leader_last_contact_time + class: Errors + type: ServiceMesh +component: Consul + lookup: average -1m unaligned of quantile_0.5 + every: 10s + units: milliseconds + warn: $this > (($status >= $WARNING) ? (150) : (200)) + crit: $this > (($status == $CRITICAL) ? 
(200) : (500)) + delay: down 5m multiplier 1.5 max 1h + info: median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes + to: sysadmin + + template: consul_raft_leadership_transitions + on: consul.raft_leadership_transitions_rate + class: Errors + type: ServiceMesh +component: Consul + lookup: sum -1m unaligned + every: 10s + units: transitions + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + info: there has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader + to: sysadmin + + template: consul_raft_thread_main_saturation + on: consul.raft_thread_main_saturation_perc + class: Utilization + type: ServiceMesh +component: Consul + lookup: average -1m unaligned of quantile_0.9 + every: 10s + units: percentage + warn: $this > (($status >= $WARNING) ? (40) : (50)) + delay: down 5m multiplier 1.5 max 1h + info: average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_raft_thread_fsm_saturation + on: consul.raft_thread_fsm_saturation_perc + class: Utilization + type: ServiceMesh +component: Consul + lookup: average -1m unaligned of quantile_0.9 + every: 10s + units: percentage + warn: $this > (($status >= $WARNING) ? (40) : (50)) + delay: down 5m multiplier 1.5 max 1h + info: average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_client_rpc_requests_exceeded + on: consul.client_rpc_requests_exceeded_rate + class: Errors + type: ServiceMesh +component: Consul + lookup: sum -1m unaligned + every: 10s + units: requests + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: down 5m multiplier 1.5 max 1h + info: number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_client_rpc_requests_failed + on: consul.client_rpc_requests_failed_rate + class: Errors + type: ServiceMesh +component: Consul + lookup: sum -1m unaligned + every: 10s + units: requests + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: down 5m multiplier 1.5 max 1h + info: number of failed RPC requests made by server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_node_health_check_status + on: consul.node_health_check_status + class: Errors + type: ServiceMesh +component: Consul + calc: $warning + $critical + every: 10s + units: status + warn: $this != nan AND $this != 0 + delay: down 5m multiplier 1.5 max 1h + info: node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_service_health_check_status + on: consul.service_health_check_status + class: Errors + type: ServiceMesh +component: Consul + calc: $warning + $critical + every: 10s + units: status + warn: $this == 1 + delay: down 5m multiplier 1.5 max 1h + info: service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_gc_pause_time + on: consul.gc_pause_time + class: Errors + type: ServiceMesh +component: Consul + lookup: sum -1m unaligned + every: 10s + units: seconds + warn: $this > (($status >= $WARNING) ? (1) : (2)) + crit: $this > (($status == $CRITICAL) ? 
(2) : (5)) + delay: down 5m multiplier 1.5 max 1h + info: time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf index 5daff61a1..fd207fbc1 100644 --- a/health/health.d/disks.conf +++ b/health/health.d/disks.conf @@ -23,7 +23,7 @@ component: Disk warn: $this > (($status >= $WARNING ) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: up 1m down 15m multiplier 1.5 max 1h - info: disk $family space utilization + info: disk ${label:mount_point} space utilization to: sysadmin template: disk_inode_usage @@ -40,7 +40,7 @@ component: Disk warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: up 1m down 15m multiplier 1.5 max 1h - info: disk $family inode utilization + info: disk ${label:mount_point} inode utilization to: sysadmin @@ -147,7 +147,7 @@ component: Disk every: 1m warn: $this > 98 * (($status >= $WARNING) ? (0.7) : (1)) delay: down 15m multiplier 1.2 max 1h - info: average percentage of time $family disk was busy over the last 10 minutes + info: average percentage of time ${label:device} disk was busy over the last 10 minutes to: silent @@ -169,5 +169,5 @@ component: Disk every: 1m warn: $this > 5000 * (($status >= $WARNING) ? 
(0.7) : (1)) delay: down 15m multiplier 1.2 max 1h - info: average backlog size of the $family disk over the last 10 minutes + info: average backlog size of the ${label:device} disk over the last 10 minutes to: silent diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf index b9d6c2374..bf9397d85 100644 --- a/health/health.d/dns_query.conf +++ b/health/health.d/dns_query.conf @@ -10,5 +10,5 @@ component: DNS every: 10s warn: $this != nan && $this != 1 delay: up 30s down 5m multiplier 1.5 max 1h - info: DNS request type $label:record_type to server $label:server is unsuccessful + info: DNS request type ${label:record_type} to server ${label:server} is unsuccessful to: sysadmin diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf new file mode 100644 index 000000000..47f8e1eb9 --- /dev/null +++ b/health/health.d/elasticsearch.conf @@ -0,0 +1,73 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# 'red' is a threshold, can't lookup the 'red' dimension - using simple pattern is a workaround. + + template: elasticsearch_cluster_health_status_red + on: elasticsearch.cluster_health_status + class: Errors + type: SearchEngine +component: Elasticsearch + lookup: average -5s unaligned of *ed + every: 10s + units: status + warn: $this == 1 + delay: down 5m multiplier 1.5 max 1h + info: cluster health status is red. + to: sysadmin + +# the idea of '-10m' is to handle yellow status after node restart, +# (usually) no action is required because Elasticsearch will automatically restore the green status. + template: elasticsearch_cluster_health_status_yellow + on: elasticsearch.cluster_health_status + class: Errors + type: SearchEngine +component: Elasticsearch + lookup: average -10m unaligned of yellow + every: 1m + units: status + warn: $this == 1 + delay: down 5m multiplier 1.5 max 1h + info: cluster health status is yellow. 
+ to: sysadmin + + template: elasticsearch_node_index_health_red + on: elasticsearch.node_index_health + class: Errors + type: SearchEngine +component: Elasticsearch + lookup: average -5s unaligned of *ed + every: 10s + units: status + warn: $this == 1 + delay: down 5m multiplier 1.5 max 1h + info: node index ${label:index} health status is red. + to: sysadmin + +# don't convert 'lookup' value to seconds in 'calc' due to UI showing seconds as hh:mm:ss (0 as now). + + template: elasticsearch_node_indices_search_time_query + on: elasticsearch.node_indices_search_time + class: Workload + type: SearchEngine +component: Elasticsearch + lookup: average -10m unaligned of query + every: 10s + units: milliseconds + warn: $this > (($status >= $WARNING) ? (20 * 1000) : (30 * 1000)) + delay: down 5m multiplier 1.5 max 1h + info: search performance is degraded, queries run slowly. + to: sysadmin + + template: elasticsearch_node_indices_search_time_fetch + on: elasticsearch.node_indices_search_time + class: Workload + type: SearchEngine +component: Elasticsearch + lookup: average -10m unaligned of fetch + every: 10s + units: milliseconds + warn: $this > (($status >= $WARNING) ? (3 * 1000) : (5 * 1000)) + crit: $this > (($status == $CRITICAL) ? (5 * 1000) : (30 * 1000)) + delay: down 5m multiplier 1.5 max 1h + info: search performance is degraded, fetches run slowly. + to: sysadmin diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf deleted file mode 100644 index bb22419fa..000000000 --- a/health/health.d/fping.conf +++ /dev/null @@ -1,64 +0,0 @@ - - template: fping_last_collected_secs - families: * - on: fping.latency - class: Latency - type: Other -component: Network - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - - template: fping_host_reachable - families: * - on: fping.latency - class: Errors - type: Other -component: Network - calc: $average != nan - units: up/down - every: 10s - crit: $this == 0 - delay: down 30m multiplier 1.5 max 2h - info: reachability status of the network host (0: unreachable, 1: reachable) - to: sysadmin - - template: fping_host_latency - families: * - on: fping.latency - class: Latency - type: Other -component: Network - lookup: average -10s unaligned of average - units: ms - every: 10s - green: 500 - red: 1000 - warn: $this > $green OR $max > $red - crit: $this > $red - delay: down 30m multiplier 1.5 max 2h - info: average latency to the network host over the last 10 seconds - to: sysadmin - - template: fping_packet_loss - families: * - on: fping.quality - class: Errors - type: System -component: Network - lookup: average -10m unaligned of returned - calc: 100 - $this - green: 1 - red: 10 - units: % - every: 10s - warn: $this > $green - crit: $this > $red - delay: down 30m multiplier 1.5 max 2h - info: packet loss ratio to the network host over the last 10 minutes - to: sysadmin diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf index 599c47acc..2008b000d 100644 --- a/health/health.d/httpcheck.conf +++ b/health/health.d/httpcheck.conf @@ -10,7 +10,7 @@ component: HTTP endpoint calc: ($this < 75) ? 
(0) : ($this) every: 5s units: up/down - info: average ratio of successful HTTP requests over the last minute (at least 75%) + info: HTTP endpoint ${label:url} liveness status to: silent template: httpcheck_web_service_bad_content @@ -25,8 +25,7 @@ component: HTTP endpoint warn: $this >= 10 AND $this < 40 crit: $this >= 40 delay: down 5m multiplier 1.5 max 1h - info: average ratio of HTTP responses with unexpected content over the last 5 minutes - options: no-clear-notification + info: percentage of HTTP responses from ${label:url} with unexpected content in the last 5 minutes to: webmaster template: httpcheck_web_service_bad_status @@ -41,8 +40,7 @@ component: HTTP endpoint warn: $this >= 10 AND $this < 40 crit: $this >= 40 delay: down 5m multiplier 1.5 max 1h - info: average ratio of HTTP responses with unexpected status over the last 5 minutes - options: no-clear-notification + info: percentage of HTTP responses from ${label:url} with unexpected status in the last 5 minutes to: webmaster template: httpcheck_web_service_timeouts @@ -54,9 +52,13 @@ component: HTTP endpoint lookup: average -5m unaligned percentage of timeout every: 10s units: % - info: average ratio of HTTP request timeouts over the last 5 minutes + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + info: percentage of timed-out HTTP requests to ${label:url} in the last 5 minutes + to: webmaster - template: httpcheck_no_web_service_connections + template: httpcheck_web_service_no_connection families: * on: httpcheck.status class: Errors @@ -65,48 +67,8 @@ component: HTTP endpoint lookup: average -5m unaligned percentage of no_connection every: 10s units: % - info: average ratio of failed requests during the last 5 minutes - -# combined timeout & no connection alarm - template: httpcheck_web_service_unreachable - families: * - on: httpcheck.status - class: Errors - type: Web Server -component: HTTP endpoint - calc: ($httpcheck_no_web_service_connections >= 
$httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts) - units: % - every: 10s - warn: ($httpcheck_no_web_service_connections >= 10 OR $httpcheck_web_service_timeouts >= 10) AND ($httpcheck_no_web_service_connections < 40 OR $httpcheck_web_service_timeouts < 40) - crit: $httpcheck_no_web_service_connections >= 40 OR $httpcheck_web_service_timeouts >= 40 - delay: down 5m multiplier 1.5 max 1h - info: ratio of failed requests either due to timeouts or no connection over the last 5 minutes - options: no-clear-notification - to: webmaster - - template: httpcheck_1h_web_service_response_time - families: * - on: httpcheck.responsetime - class: Latency - type: Other -component: HTTP endpoint - lookup: average -1h unaligned of time - every: 30s - units: ms - info: average HTTP response time over the last hour - - template: httpcheck_web_service_slow - families: * - on: httpcheck.responsetime - class: Latency - type: Web Server -component: HTTP endpoint - lookup: average -3m unaligned of time - units: ms - every: 10s - warn: ($this > ($httpcheck_1h_web_service_response_time * 2) ) - crit: ($this > ($httpcheck_1h_web_service_response_time * 3) ) + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 delay: down 5m multiplier 1.5 max 1h - info: average HTTP response time over the last 3 minutes, compared to the average over the last hour - options: no-clear-notification + info: percentage of failed HTTP requests to ${label:url} in the last 5 minutes to: webmaster diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf index c2778cc5e..428b6ee91 100644 --- a/health/health.d/kubelet.conf +++ b/health/health.d/kubelet.conf @@ -9,7 +9,7 @@ class: Errors type: Kubernetes component: Kubelet - calc: $kubelet_node_config_error + calc: $experiencing_error units: bool every: 10s warn: $this == 1 @@ -20,12 +20,12 @@ component: Kubelet # Failed Token() requests to the alternate token source template: 
kubelet_token_requests - lookup: sum -10s of token_fail_count on: k8s_kubelet.kubelet_token_requests class: Errors type: Kubernetes component: Kubelet - units: failed requests + lookup: sum -10s of failed + units: requests every: 10s warn: $this > 0 delay: down 1m multiplier 1.5 max 2h @@ -35,11 +35,11 @@ component: Kubelet # Docker and runtime operation errors template: kubelet_operations_error - lookup: sum -1m on: k8s_kubelet.kubelet_operations_errors class: Errors type: Kubernetes component: Kubelet + lookup: sum -1m units: errors every: 10s warn: $this > (($status >= $WARNING) ? (0) : (20)) @@ -67,7 +67,7 @@ component: Kubelet class: Latency type: Kubernetes component: Kubelet - lookup: average -1m unaligned of kubelet_pleg_relist_latency_05 + lookup: average -1m unaligned of 0.5 units: microseconds every: 10s info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5) @@ -77,7 +77,7 @@ component: Kubelet class: Latency type: Kubernetes component: Kubelet - lookup: average -10s unaligned of kubelet_pleg_relist_latency_05 + lookup: average -10s unaligned of 0.5 calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05)) every: 10s units: % @@ -95,7 +95,7 @@ component: Kubelet class: Latency type: Kubernetes component: Kubelet - lookup: average -1m unaligned of kubelet_pleg_relist_latency_09 + lookup: average -1m unaligned of 0.9 units: microseconds every: 10s info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9) @@ -105,7 +105,7 @@ component: Kubelet class: Latency type: Kubernetes component: Kubelet - lookup: average -10s unaligned of kubelet_pleg_relist_latency_09 + lookup: average -10s unaligned of 0.9 calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09)) every: 10s units: % @@ -123,7 +123,7 @@ component: Kubelet class: Latency type: 
Kubernetes component: Kubelet - lookup: average -1m unaligned of kubelet_pleg_relist_latency_099 + lookup: average -1m unaligned of 0.99 units: microseconds every: 10s info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99) @@ -133,7 +133,7 @@ component: Kubelet class: Latency type: Kubernetes component: Kubelet - lookup: average -10s unaligned of kubelet_pleg_relist_latency_099 + lookup: average -10s unaligned of 0.99 calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099)) every: 10s units: % diff --git a/health/health.d/load.conf b/health/health.d/load.conf index 0bd872f85..75989c57f 100644 --- a/health/health.d/load.conf +++ b/health/health.d/load.conf @@ -11,7 +11,7 @@ component: Load os: linux hosts: * - calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors ) + calc: ($active_processors == nan or $active_processors == 0) ? (nan) : ( ($active_processors < 2) ? ( 2 ) : ( $active_processors ) ) units: cpus every: 1m info: number of active CPU cores in the system @@ -28,6 +28,7 @@ component: Load os: linux hosts: * lookup: max -1m unaligned of load15 + calc: ($load_cpu_number == nan) ? (nan) : ($this) units: load every: 1m warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200) @@ -43,6 +44,7 @@ component: Load os: linux hosts: * lookup: max -1m unaligned of load5 + calc: ($load_cpu_number == nan) ? (nan) : ($this) units: load every: 1m warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400) @@ -58,6 +60,7 @@ component: Load os: linux hosts: * lookup: max -1m unaligned of load1 + calc: ($load_cpu_number == nan) ? (nan) : ($this) units: load every: 1m warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 
700 : 800) diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf index cedaa000e..ed980a26a 100644 --- a/health/health.d/mdstat.conf +++ b/health/health.d/mdstat.conf @@ -20,7 +20,7 @@ component: RAID every: 10s calc: $down crit: $this > 0 - info: number of devices in the down state for the $family array. \ + info: number of devices in the down state for the ${label:device} ${label:raid_level} array. \ Any number > 0 indicates that the array is degraded. to: sysadmin @@ -35,7 +35,7 @@ component: RAID every: 60s warn: $this > 1024 delay: up 30m - info: number of unsynchronized blocks for the $family array + info: number of unsynchronized blocks for the ${label:device} ${label:raid_level} array to: sysadmin template: mdstat_nonredundant_last_collected diff --git a/health/health.d/net.conf b/health/health.d/net.conf index 9d5b3b8d3..a0723f303 100644 --- a/health/health.d/net.conf +++ b/health/health.d/net.conf @@ -15,7 +15,7 @@ component: Network calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max) : ( nan ) units: Mbit every: 10s - info: network interface $family current speed + info: network interface ${label:device} current speed template: 1m_received_traffic_overflow on: net.net @@ -31,7 +31,7 @@ component: Network every: 10s warn: $this > (($status >= $WARNING) ? (85) : (90)) delay: up 1m down 1m multiplier 1.5 max 1h - info: average inbound utilization for the network interface $family over the last minute + info: average inbound utilization for the network interface ${label:device} over the last minute to: sysadmin template: 1m_sent_traffic_overflow @@ -48,7 +48,7 @@ component: Network every: 10s warn: $this > (($status >= $WARNING) ? 
(85) : (90)) delay: up 1m down 1m multiplier 1.5 max 1h - info: average outbound utilization for the network interface $family over the last minute + info: average outbound utilization for the network interface ${label:device} over the last minute to: sysadmin # ----------------------------------------------------------------------------- @@ -72,7 +72,7 @@ component: Network lookup: sum -10m unaligned absolute of inbound units: packets every: 1m - info: number of inbound dropped packets for the network interface $family in the last 10 minutes + info: number of inbound dropped packets for the network interface ${label:device} in the last 10 minutes template: outbound_packets_dropped on: net.drops @@ -85,7 +85,7 @@ component: Network lookup: sum -10m unaligned absolute of outbound units: packets every: 1m - info: number of outbound dropped packets for the network interface $family in the last 10 minutes + info: number of outbound dropped packets for the network interface ${label:device} in the last 10 minutes template: inbound_packets_dropped_ratio on: net.packets @@ -101,7 +101,7 @@ component: Network every: 1m warn: $this >= 2 delay: up 1m down 1h multiplier 1.5 max 2h - info: ratio of inbound dropped packets for the network interface $family over the last 10 minutes + info: ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes to: sysadmin template: outbound_packets_dropped_ratio @@ -118,7 +118,7 @@ component: Network every: 1m warn: $this >= 2 delay: up 1m down 1h multiplier 1.5 max 2h - info: ratio of outbound dropped packets for the network interface $family over the last 10 minutes + info: ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes to: sysadmin template: wifi_inbound_packets_dropped_ratio @@ -135,7 +135,7 @@ component: Network every: 1m warn: $this >= 10 delay: up 1m down 1h multiplier 1.5 max 2h - info: ratio of inbound dropped packets for the network 
interface $family over the last 10 minutes + info: ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes to: sysadmin template: wifi_outbound_packets_dropped_ratio @@ -152,7 +152,7 @@ component: Network every: 1m warn: $this >= 10 delay: up 1m down 1h multiplier 1.5 max 2h - info: ratio of outbound dropped packets for the network interface $family over the last 10 minutes + info: ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes to: sysadmin # ----------------------------------------------------------------------------- @@ -171,7 +171,7 @@ component: Network every: 1m warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h - info: number of inbound errors for the network interface $family in the last 10 minutes + info: number of inbound errors for the network interface ${label:device} in the last 10 minutes to: sysadmin template: interface_outbound_errors @@ -187,7 +187,7 @@ component: Network every: 1m warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h - info: number of outbound errors for the network interface $family in the last 10 minutes + info: number of outbound errors for the network interface ${label:device} in the last 10 minutes to: sysadmin # ----------------------------------------------------------------------------- @@ -211,7 +211,7 @@ component: Network every: 1m warn: $this > 0 delay: down 1h multiplier 1.5 max 2h - info: number of FIFO errors for the network interface $family in the last 10 minutes + info: number of FIFO errors for the network interface ${label:device} in the last 10 minutes to: sysadmin # ----------------------------------------------------------------------------- @@ -234,7 +234,7 @@ component: Network lookup: average -1m unaligned of received units: packets every: 10s - info: average number of packets received by the network interface $family over the last minute + info: average number of packets received by the network interface 
${label:device} over the last minute template: 10s_received_packets_storm on: net.packets @@ -251,6 +251,6 @@ component: Network warn: $this > (($status >= $WARNING)?(200):(5000)) crit: $this > (($status == $CRITICAL)?(5000):(6000)) options: no-clear-notification - info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \ + info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ compared to the rate over the last minute to: sysadmin diff --git a/health/health.d/nvme.conf b/health/health.d/nvme.conf index 5f729d52b..b7c0e6fd4 100644 --- a/health/health.d/nvme.conf +++ b/health/health.d/nvme.conf @@ -11,5 +11,5 @@ component: Disk every: 10s crit: $this != nan AND $this != 0 delay: down 5m multiplier 1.5 max 2h - info: NVMe device $label:device has critical warnings + info: NVMe device ${label:device} has critical warnings to: sysadmin diff --git a/health/health.d/ping.conf b/health/health.d/ping.conf index cbe7c30c9..fa8213ad3 100644 --- a/health/health.d/ping.conf +++ b/health/health.d/ping.conf @@ -12,7 +12,7 @@ component: Network every: 10s crit: $this == 0 delay: down 30m multiplier 1.5 max 2h - info: network host $label:host reachability status + info: network host ${label:host} reachability status to: sysadmin template: ping_packet_loss @@ -29,7 +29,7 @@ component: Network warn: $this > $green crit: $this > $red delay: down 30m multiplier 1.5 max 2h - info: packet loss percentage to the network host $label:host over the last 10 minutes + info: packet loss percentage to the network host ${label:host} over the last 10 minutes to: sysadmin template: ping_host_latency @@ -46,5 +46,5 @@ component: Network warn: $this > $green OR $max > $red crit: $this > $red delay: down 30m multiplier 1.5 max 2h - info: average latency to the network host $label:host over the last 10 seconds + info: average latency to the network host ${label:host} over the last 10 
seconds to: sysadmin diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf index 8cbd7729c..e8908404c 100644 --- a/health/health.d/portcheck.conf +++ b/health/health.d/portcheck.conf @@ -10,7 +10,7 @@ component: TCP endpoint calc: ($this < 75) ? (0) : ($this) every: 5s units: up/down - info: average ratio of successful connections over the last minute (at least 75%) + info: TCP host ${label:host} port ${label:port} liveness status to: silent template: portcheck_connection_timeouts @@ -25,7 +25,7 @@ component: TCP endpoint warn: $this >= 10 AND $this < 40 crit: $this >= 40 delay: down 5m multiplier 1.5 max 1h - info: average ratio of timeouts over the last 5 minutes + info: percentage of timed-out TCP connections to host ${label:host} port ${label:port} in the last 5 minutes to: sysadmin template: portcheck_connection_fails @@ -40,5 +40,5 @@ component: TCP endpoint warn: $this >= 10 AND $this < 40 crit: $this >= 40 delay: down 5m multiplier 1.5 max 1h - info: average ratio of failed connections over the last 5 minutes + info: percentage of failed TCP connections to host ${label:host} port ${label:port} in the last 5 minutes to: sysadmin diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf index 66d034cfe..67b25673b 100644 --- a/health/health.d/postgres.conf +++ b/health/health.d/postgres.conf @@ -58,7 +58,7 @@ component: PostgreSQL warn: $this < (($status >= $WARNING) ? (70) : (60)) crit: $this < (($status == $CRITICAL) ? (60) : (50)) delay: down 15m multiplier 1.5 max 1h - info: average cache hit ratio in db $label:database over the last minute + info: average cache hit ratio in db ${label:database} over the last minute to: dba template: postgres_db_transactions_rollback_ratio @@ -72,7 +72,7 @@ component: PostgreSQL every: 1m warn: $this > (($status >= $WARNING) ? 
(0) : (2)) delay: down 15m multiplier 1.5 max 1h - info: average aborted transactions percentage in db $label:database over the last five minutes + info: average aborted transactions percentage in db ${label:database} over the last five minutes to: dba template: postgres_db_deadlocks_rate @@ -86,7 +86,7 @@ component: PostgreSQL every: 1m warn: $this > (($status >= $WARNING) ? (0) : (10)) delay: down 15m multiplier 1.5 max 1h - info: number of deadlocks detected in db $label:database in the last minute + info: number of deadlocks detected in db ${label:database} in the last minute to: dba # Table alarms @@ -104,7 +104,7 @@ component: PostgreSQL warn: $this < (($status >= $WARNING) ? (70) : (60)) crit: $this < (($status == $CRITICAL) ? (60) : (50)) delay: down 15m multiplier 1.5 max 1h - info: average cache hit ratio in db $label:database table $label:table over the last minute + info: average cache hit ratio in db ${label:database} table ${label:table} over the last minute to: dba template: postgres_table_index_cache_io_ratio @@ -120,7 +120,7 @@ component: PostgreSQL warn: $this < (($status >= $WARNING) ? (70) : (60)) crit: $this < (($status == $CRITICAL) ? (60) : (50)) delay: down 15m multiplier 1.5 max 1h - info: average index cache hit ratio in db $label:database table $label:table over the last minute + info: average index cache hit ratio in db ${label:database} table ${label:table} over the last minute to: dba template: postgres_table_toast_cache_io_ratio @@ -136,7 +136,7 @@ component: PostgreSQL warn: $this < (($status >= $WARNING) ? (70) : (60)) crit: $this < (($status == $CRITICAL) ? 
(60) : (50)) delay: down 15m multiplier 1.5 max 1h - info: average TOAST hit ratio in db $label:database table $label:table over the last minute + info: average TOAST hit ratio in db ${label:database} table ${label:table} over the last minute to: dba template: postgres_table_toast_index_cache_io_ratio @@ -152,7 +152,7 @@ component: PostgreSQL warn: $this < (($status >= $WARNING) ? (70) : (60)) crit: $this < (($status == $CRITICAL) ? (60) : (50)) delay: down 15m multiplier 1.5 max 1h - info: average index TOAST hit ratio in db $label:database table $label:table over the last minute + info: average index TOAST hit ratio in db ${label:database} table ${label:table} over the last minute to: dba template: postgres_table_bloat_size_perc @@ -161,13 +161,13 @@ component: PostgreSQL type: Database component: PostgreSQL hosts: * - calc: $bloat + calc: ($table_size > (1024 * 1024 * 100)) ? ($bloat) : (0) units: % every: 1m warn: $this > (($status >= $WARNING) ? (60) : (70)) crit: $this > (($status == $CRITICAL) ? 
(70) : (80)) delay: down 15m multiplier 1.5 max 1h - info: bloat size percentage in db $label:database table $label:table + info: bloat size percentage in db ${label:database} table ${label:table} to: dba template: postgres_table_last_autovacuum_time @@ -180,7 +180,7 @@ component: PostgreSQL units: seconds every: 1m warn: $this != nan AND $this > (60 * 60 * 24 * 7) - info: time elapsed since db $label:database table $label:table was vacuumed by the autovacuum daemon + info: time elapsed since db ${label:database} table ${label:table} was vacuumed by the autovacuum daemon to: dba template: postgres_table_last_autoanalyze_time @@ -193,7 +193,7 @@ component: PostgreSQL units: seconds every: 1m warn: $this != nan AND $this > (60 * 60 * 24 * 7) - info: time elapsed since db $label:database table $label:table was analyzed by the autovacuum daemon + info: time elapsed since db ${label:database} table ${label:table} was analyzed by the autovacuum daemon to: dba # Index alarms @@ -204,11 +204,11 @@ component: PostgreSQL type: Database component: PostgreSQL hosts: * - calc: $bloat + calc: ($index_size > (1024 * 1024 * 10)) ? ($bloat) : (0) units: % every: 1m warn: $this > (($status >= $WARNING) ? (60) : (70)) crit: $this > (($status == $CRITICAL) ? 
(70) : (80)) delay: down 15m multiplier 1.5 max 1h - info: bloat size percentage in db $label:database table $label:table index $label:index + info: bloat size percentage in db ${label:database} table ${label:table} index ${label:index} to: dba diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf index 785838d47..7f8ea2793 100644 --- a/health/health.d/zfs.conf +++ b/health/health.d/zfs.conf @@ -24,7 +24,7 @@ component: File system every: 10s warn: $this > 0 delay: down 1m multiplier 1.5 max 1h - info: ZFS pool $family state is degraded + info: ZFS pool ${label:pool} state is degraded to: sysadmin template: zfs_pool_state_crit @@ -37,5 +37,5 @@ component: File system every: 10s crit: $this > 0 delay: down 1m multiplier 1.5 max 1h - info: ZFS pool $family state is faulted or unavail + info: ZFS pool ${label:pool} state is faulted or unavail to: sysadmin |