Merging upstream version 1.38.0.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2023-02-06 16:11:34 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2023-02-06 16:11:34 +0000
commit: d079b656b4719739b2247dcd9d46e9bec793095a (patch)
tree: d2c950c70a776bcf697c963151c5bd959f8a9f03 /health/health.d
parent: Releasing debian version 1.37.1-2. (diff)
download: netdata-d079b656b4719739b2247dcd9d46e9bec793095a.tar.xz
netdata-d079b656b4719739b2247dcd9d46e9bec793095a.zip
16 files changed, 305 insertions, 172 deletions
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index 4bfe38b6..08260ff6 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -51,7 +51,7 @@ component: Network
    lookup: average -1m unaligned of received
     units: packets
     every: 10s
-     info: average number of packets received by the network interface $family over the last minute
+     info: average number of packets received by the network interface ${label:device} over the last minute
 
  template: cgroup_10s_received_packets_storm
        on: cgroup.net_packets
@@ -66,7 +66,7 @@ component: Network
      warn: $this > (($status >= $WARNING)?(200):(5000))
      crit: $this > (($status == $CRITICAL)?(5000):(6000))
   options: no-clear-notification
-     info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \
+     info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
            compared to the rate over the last minute
        to: sysadmin
 
@@ -121,7 +121,7 @@ component: Network
    lookup: average -1m unaligned of received
     units: packets
     every: 10s
-     info: average number of packets received by the network interface $family over the last minute
+     info: average number of packets received by the network interface ${label:device} over the last minute
 
  template: k8s_cgroup_10s_received_packets_storm
        on: k8s.cgroup.net_packets
@@ -136,6 +136,6 @@ component: Network
      warn: $this > (($status >= $WARNING)?(200):(5000))
      crit: $this > (($status == $CRITICAL)?(5000):(6000))
   options: no-clear-notification
-     info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \
+     info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
            compared to the rate over the last minute
        to: sysadmin
diff --git a/health/health.d/consul.conf b/health/health.d/consul.conf
new file mode 100644
index 00000000..dff6d2df
--- /dev/null
+++ b/health/health.d/consul.conf
@@ -0,0 +1,159 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: consul_license_expiration_time
+       on: consul.license_expiration_time
+    class: Errors
+     type: ServiceMesh
+component: Consul
+     calc: $license_expiration
+    every: 60m
+    units: seconds
+     warn: $this < 14*24*60*60
+     crit: $this < 7*24*60*60
+     info: Consul Enterprise license expiration time on node ${label:node_name} datacenter ${label:datacenter}
+       to: sysadmin
+
+ template: consul_autopilot_health_status
+       on: consul.autopilot_health_status
+    class: Errors
+     type: ServiceMesh
+component: Consul
+     calc: $unhealthy
+    every: 10s
+    units: status
+     warn: $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name}
+       to: sysadmin
+
+ template: consul_autopilot_server_health_status
+       on: consul.autopilot_server_health_status
+    class: Errors
+     type: ServiceMesh
+component: Consul
+     calc: $unhealthy
+    every: 10s
+    units: status
+     warn: $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: server ${label:node_name} from datacenter ${label:datacenter} is unhealthy
+       to: sysadmin
+
+ template: consul_raft_leader_last_contact_time
+       on: consul.raft_leader_last_contact_time
+    class: Errors
+     type: ServiceMesh
+component: Consul
+   lookup: average -1m unaligned of quantile_0.5
+    every: 10s
+    units: milliseconds
+     warn: $this > (($status >= $WARNING)  ? (150) : (200))
+     crit: $this > (($status == $CRITICAL) ? (200) : (500))
+    delay: down 5m multiplier 1.5 max 1h
+     info: median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes
+       to: sysadmin
+
+ template: consul_raft_leadership_transitions
+       on: consul.raft_leadership_transitions_rate
+    class: Errors
+     type: ServiceMesh
+component: Consul
+   lookup: sum -1m unaligned
+    every: 10s
+    units: transitions
+     warn: $this > 0
+    delay: down 5m multiplier 1.5 max 1h
+     info: there has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader
+       to: sysadmin
+
+ template: consul_raft_thread_main_saturation
+       on: consul.raft_thread_main_saturation_perc
+    class: Utilization
+     type: ServiceMesh
+component: Consul
+   lookup: average -1m unaligned of quantile_0.9
+    every: 10s
+    units: percentage
+     warn: $this > (($status >= $WARNING)  ? (40) : (50))
+    delay: down 5m multiplier 1.5 max 1h
+     info: average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
+       to: sysadmin
+
+ template: consul_raft_thread_fsm_saturation
+       on: consul.raft_thread_fsm_saturation_perc
+    class: Utilization
+     type: ServiceMesh
+component: Consul
+   lookup: average -1m unaligned of quantile_0.9
+    every: 10s
+    units: percentage
+     warn: $this > (($status >= $WARNING)  ? (40) : (50))
+    delay: down 5m multiplier 1.5 max 1h
+     info: average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
+       to: sysadmin
+
+ template: consul_client_rpc_requests_exceeded
+       on: consul.client_rpc_requests_exceeded_rate
+    class: Errors
+     type: ServiceMesh
+component: Consul
+   lookup: sum -1m unaligned
+    every: 10s
+    units: requests
+     warn: $this > (($status >= $WARNING)  ? (0) : (5))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
+       to: sysadmin
+
+ template: consul_client_rpc_requests_failed
+       on: consul.client_rpc_requests_failed_rate
+    class: Errors
+     type: ServiceMesh
+component: Consul
+   lookup: sum -1m unaligned
+    every: 10s
+    units: requests
+     warn: $this > (($status >= $WARNING)  ? (0) : (5))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of failed RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
+       to: sysadmin
+
+ template: consul_node_health_check_status
+       on: consul.node_health_check_status
+    class: Errors
+     type: ServiceMesh
+component: Consul
+     calc: $warning + $critical
+    every: 10s
+    units: status
+     warn: $this != nan AND $this != 0
+    delay: down 5m multiplier 1.5 max 1h
+     info: node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
+       to: sysadmin
+
+ template: consul_service_health_check_status
+       on: consul.service_health_check_status
+    class: Errors
+     type: ServiceMesh
+component: Consul
+     calc: $warning + $critical
+    every: 10s
+    units: status
+     warn: $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
+       to: sysadmin
+
+ template: consul_gc_pause_time
+       on: consul.gc_pause_time
+    class: Errors
+     type: ServiceMesh
+component: Consul
+   lookup: sum -1m unaligned
+    every: 10s
+    units: seconds
+     warn: $this > (($status >= $WARNING)  ? (1) : (2))
+     crit: $this > (($status == $CRITICAL) ? (2) : (5))
+    delay: down 5m multiplier 1.5 max 1h
+     info: time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter}
+       to: sysadmin
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
index 5daff61a..fd207fbc 100644
--- a/health/health.d/disks.conf
+++ b/health/health.d/disks.conf
@@ -23,7 +23,7 @@ component: Disk
      warn: $this > (($status >= $WARNING ) ? (80) : (90))
      crit: $this > (($status == $CRITICAL) ? (90) : (98))
     delay: up 1m down 15m multiplier 1.5 max 1h
-     info: disk $family space utilization
+     info: disk ${label:mount_point} space utilization
        to: sysadmin
 
  template: disk_inode_usage
@@ -40,7 +40,7 @@ component: Disk
      warn: $this > (($status >= $WARNING)  ? (80) : (90))
      crit: $this > (($status == $CRITICAL) ? (90) : (98))
     delay: up 1m down 15m multiplier 1.5 max 1h
-     info: disk $family inode utilization
+     info: disk ${label:mount_point} inode utilization
        to: sysadmin
 
 
@@ -147,7 +147,7 @@ component: Disk
     every: 1m
      warn: $this > 98 * (($status >= $WARNING)  ? (0.7) : (1))
     delay: down 15m multiplier 1.2 max 1h
-     info: average percentage of time $family disk was busy over the last 10 minutes
+     info: average percentage of time ${label:device} disk was busy over the last 10 minutes
        to: silent
 
 
@@ -169,5 +169,5 @@ component: Disk
     every: 1m
      warn: $this > 5000 * (($status >= $WARNING)  ? (0.7) : (1))
     delay: down 15m multiplier 1.2 max 1h
-     info: average backlog size of the $family disk over the last 10 minutes
+     info: average backlog size of the ${label:device} disk over the last 10 minutes
        to: silent
diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf
index b9d6c237..bf9397d8 100644
--- a/health/health.d/dns_query.conf
+++ b/health/health.d/dns_query.conf
@@ -10,5 +10,5 @@ component: DNS
     every: 10s
      warn: $this != nan && $this != 1
     delay: up 30s down 5m multiplier 1.5 max 1h
-     info: DNS request type $label:record_type to server $label:server is unsuccessful
+     info: DNS request type ${label:record_type} to server ${label:server} is unsuccessful
        to: sysadmin
diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf
new file mode 100644
index 00000000..47f8e1eb
--- /dev/null
+++ b/health/health.d/elasticsearch.conf
@@ -0,0 +1,73 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# 'red' is a threshold, can't lookup the 'red' dimension - using simple pattern is a workaround.
+
+ template: elasticsearch_cluster_health_status_red
+       on: elasticsearch.cluster_health_status
+    class: Errors
+     type: SearchEngine
+component: Elasticsearch
+   lookup: average -5s unaligned of *ed
+    every: 10s
+    units: status
+     warn: $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: cluster health status is red.
+       to: sysadmin
+
+# the idea of '-10m' is to handle yellow status after node restart,
+# (usually) no action is required because Elasticsearch will automatically restore the green status.
+ template: elasticsearch_cluster_health_status_yellow
+       on: elasticsearch.cluster_health_status
+    class: Errors
+     type: SearchEngine
+component: Elasticsearch
+   lookup: average -10m unaligned of yellow
+    every: 1m
+    units: status
+     warn: $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: cluster health status is yellow.
+       to: sysadmin
+
+ template: elasticsearch_node_index_health_red
+       on: elasticsearch.node_index_health
+    class: Errors
+     type: SearchEngine
+component: Elasticsearch
+   lookup: average -5s unaligned of *ed
+    every: 10s
+    units: status
+     warn: $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: node index ${label:index} health status is red.
+       to: sysadmin
+
+# don't convert 'lookup' value to seconds in 'calc' due to UI showing seconds as hh:mm:ss (0 as now).
+
+ template: elasticsearch_node_indices_search_time_query
+       on: elasticsearch.node_indices_search_time
+    class: Workload
+     type: SearchEngine
+component: Elasticsearch
+   lookup: average -10m unaligned of query
+    every: 10s
+    units: milliseconds
+     warn: $this > (($status >= $WARNING)  ? (20 * 1000) : (30 * 1000))
+    delay: down 5m multiplier 1.5 max 1h
+     info: search performance is degraded, queries run slowly.
+       to: sysadmin
+
+ template: elasticsearch_node_indices_search_time_fetch
+       on: elasticsearch.node_indices_search_time
+    class: Workload
+     type: SearchEngine
+component: Elasticsearch
+   lookup: average -10m unaligned of fetch
+    every: 10s
+    units: milliseconds
+     warn: $this > (($status >= $WARNING)  ? (3 * 1000) : (5 * 1000))
+     crit: $this > (($status == $CRITICAL) ? (5 * 1000) : (30 * 1000))
+    delay: down 5m multiplier 1.5 max 1h
+     info: search performance is degraded, fetches run slowly.
+       to: sysadmin
diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf
deleted file mode 100644
index bb22419f..00000000
--- a/health/health.d/fping.conf
+++ /dev/null
@@ -1,64 +0,0 @@
-
- template: fping_last_collected_secs
- families: *
-       on: fping.latency
-    class: Latency
-     type: Other
-component: Network
-     calc: $now - $last_collected_t
-    units: seconds ago
-    every: 10s
-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    delay: down 5m multiplier 1.5 max 1h
-     info: number of seconds since the last successful data collection
-       to: sysadmin
-
- template: fping_host_reachable
- families: *
-       on: fping.latency
-    class: Errors
-     type: Other
-component: Network
-     calc: $average != nan
-    units: up/down
-    every: 10s
-     crit: $this == 0
-    delay: down 30m multiplier 1.5 max 2h
-     info: reachability status of the network host (0: unreachable, 1: reachable)
-       to: sysadmin
-
- template: fping_host_latency
- families: *
-       on: fping.latency
-    class: Latency
-     type: Other
-component: Network
-   lookup: average -10s unaligned of average
-    units: ms
-    every: 10s
-    green: 500
-      red: 1000
-     warn: $this > $green OR $max > $red
-     crit: $this > $red
-    delay: down 30m multiplier 1.5 max 2h
-     info: average latency to the network host over the last 10 seconds
-       to: sysadmin
-
- template: fping_packet_loss
- families: *
-       on: fping.quality
-    class: Errors
-     type: System
-component: Network
-   lookup: average -10m unaligned of returned
-     calc: 100 - $this
-    green: 1
-      red: 10
-    units: %
-    every: 10s
-     warn: $this > $green
-     crit: $this > $red
-    delay: down 30m multiplier 1.5 max 2h
-     info: packet loss ratio to the network host over the last 10 minutes
-       to: sysadmin
diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf
index 599c47ac..2008b000 100644
--- a/health/health.d/httpcheck.conf
+++ b/health/health.d/httpcheck.conf
@@ -10,7 +10,7 @@ component: HTTP endpoint
      calc: ($this < 75) ? (0) : ($this)
     every: 5s
     units: up/down
-     info: average ratio of successful HTTP requests over the last minute (at least 75%)
+     info: HTTP endpoint ${label:url} liveness status
        to: silent
 
  template: httpcheck_web_service_bad_content
@@ -25,8 +25,7 @@ component: HTTP endpoint
      warn: $this >= 10 AND $this < 40
      crit: $this >= 40
     delay: down 5m multiplier 1.5 max 1h
-     info: average ratio of HTTP responses with unexpected content over the last 5 minutes
-  options: no-clear-notification
+     info: percentage of HTTP responses from ${label:url} with unexpected content in the last 5 minutes
        to: webmaster
 
  template: httpcheck_web_service_bad_status
@@ -41,8 +40,7 @@ component: HTTP endpoint
      warn: $this >= 10 AND $this < 40
      crit: $this >= 40
     delay: down 5m multiplier 1.5 max 1h
-     info: average ratio of HTTP responses with unexpected status over the last 5 minutes
-  options: no-clear-notification
+     info: percentage of HTTP responses from ${label:url} with unexpected status in the last 5 minutes
        to: webmaster
 
  template: httpcheck_web_service_timeouts
@@ -54,9 +52,13 @@ component: HTTP endpoint
    lookup: average -5m unaligned percentage of timeout
     every: 10s
     units: %
-     info: average ratio of HTTP request timeouts over the last 5 minutes
+     warn: $this >= 10 AND $this < 40
+     crit: $this >= 40
+    delay: down 5m multiplier 1.5 max 1h
+     info: percentage of timed-out HTTP requests to ${label:url} in the last 5 minutes
+       to: webmaster
 
- template: httpcheck_no_web_service_connections
+ template: httpcheck_web_service_no_connection
  families: *
        on: httpcheck.status
     class: Errors
@@ -65,48 +67,8 @@ component: HTTP endpoint
    lookup: average -5m unaligned percentage of no_connection
     every: 10s
     units: %
-     info: average ratio of failed requests during the last 5 minutes
-
-# combined timeout & no connection alarm
- template: httpcheck_web_service_unreachable
- families: *
-       on: httpcheck.status
-    class: Errors
-     type: Web Server
-component: HTTP endpoint
-     calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts)
-    units: %
-    every: 10s
-     warn: ($httpcheck_no_web_service_connections >= 10 OR $httpcheck_web_service_timeouts >= 10) AND ($httpcheck_no_web_service_connections < 40 OR $httpcheck_web_service_timeouts < 40)
-     crit: $httpcheck_no_web_service_connections >= 40 OR $httpcheck_web_service_timeouts >= 40
-    delay: down 5m multiplier 1.5 max 1h
-     info: ratio of failed requests either due to timeouts or no connection over the last 5 minutes
-  options: no-clear-notification
-       to: webmaster
-
- template: httpcheck_1h_web_service_response_time
- families: *
-       on: httpcheck.responsetime
-    class: Latency
-     type: Other
-component: HTTP endpoint
-   lookup: average -1h unaligned of time
-    every: 30s
-    units: ms
-     info: average HTTP response time over the last hour
-
- template: httpcheck_web_service_slow
- families: *
-       on: httpcheck.responsetime
-    class: Latency
-     type: Web Server
-component: HTTP endpoint
-   lookup: average -3m unaligned of time
-    units: ms
-    every: 10s
-     warn: ($this > ($httpcheck_1h_web_service_response_time * 2) )
-     crit: ($this > ($httpcheck_1h_web_service_response_time * 3) )
+     warn: $this >= 10 AND $this < 40
+     crit: $this >= 40
     delay: down 5m multiplier 1.5 max 1h
-     info: average HTTP response time over the last 3 minutes, compared to the average over the last hour
-  options: no-clear-notification
+     info: percentage of failed HTTP requests to ${label:url} in the last 5 minutes
        to: webmaster
diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf
index c2778cc5..428b6ee9 100644
--- a/health/health.d/kubelet.conf
+++ b/health/health.d/kubelet.conf
@@ -9,7 +9,7 @@
     class: Errors
      type: Kubernetes
 component: Kubelet
-     calc: $kubelet_node_config_error
+     calc: $experiencing_error
     units: bool
     every: 10s
      warn: $this == 1
@@ -20,12 +20,12 @@ component: Kubelet
 # Failed Token() requests to the alternate token source
 
  template: kubelet_token_requests
-   lookup: sum -10s of token_fail_count
        on: k8s_kubelet.kubelet_token_requests
     class: Errors
      type: Kubernetes
 component: Kubelet
-    units: failed requests
+   lookup: sum -10s of failed
+    units: requests
     every: 10s
      warn: $this > 0
     delay: down 1m multiplier 1.5 max 2h
@@ -35,11 +35,11 @@ component: Kubelet
 # Docker and runtime operation errors
 
  template: kubelet_operations_error
-   lookup: sum -1m
        on: k8s_kubelet.kubelet_operations_errors
     class: Errors
      type: Kubernetes
 component: Kubelet
+   lookup: sum -1m
     units: errors
     every: 10s
      warn: $this > (($status >= $WARNING)  ? (0) : (20))
@@ -67,7 +67,7 @@ component: Kubelet
     class: Latency
      type: Kubernetes
 component: Kubelet
-   lookup: average -1m unaligned of kubelet_pleg_relist_latency_05
+   lookup: average -1m unaligned of 0.5
     units: microseconds
     every: 10s
      info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5)
@@ -77,7 +77,7 @@ component: Kubelet
     class: Latency
      type: Kubernetes
 component: Kubelet
-   lookup: average -10s unaligned of kubelet_pleg_relist_latency_05
+   lookup: average -10s unaligned of 0.5
      calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
     every: 10s
     units: %
@@ -95,7 +95,7 @@ component: Kubelet
     class: Latency
      type: Kubernetes
 component: Kubelet
-   lookup: average -1m unaligned of kubelet_pleg_relist_latency_09
+   lookup: average -1m unaligned of 0.9
     units: microseconds
     every: 10s
      info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9)
@@ -105,7 +105,7 @@ component: Kubelet
     class: Latency
      type: Kubernetes
 component: Kubelet
-   lookup: average -10s unaligned of kubelet_pleg_relist_latency_09
+   lookup: average -10s unaligned of 0.9
      calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
     every: 10s
     units: %
@@ -123,7 +123,7 @@ component: Kubelet
     class: Latency
      type: Kubernetes
 component: Kubelet
-   lookup: average -1m unaligned of kubelet_pleg_relist_latency_099
+   lookup: average -1m unaligned of 0.99
     units: microseconds
     every: 10s
      info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99)
@@ -133,7 +133,7 @@ component: Kubelet
     class: Latency
      type: Kubernetes
 component: Kubelet
-   lookup: average -10s unaligned of kubelet_pleg_relist_latency_099
+   lookup: average -10s unaligned of 0.99
      calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
     every: 10s
     units: %
diff --git a/health/health.d/load.conf b/health/health.d/load.conf
index 0bd872f8..75989c57 100644
--- a/health/health.d/load.conf
+++ b/health/health.d/load.conf
@@ -11,7 +11,7 @@
 component: Load
        os: linux
     hosts: *
-     calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors )
+     calc: ($active_processors == nan or $active_processors == 0) ? (nan) : ( ($active_processors < 2) ? ( 2 ) : ( $active_processors ) )
     units: cpus
     every: 1m
      info: number of active CPU cores in the system
@@ -28,6 +28,7 @@ component: Load
        os: linux
     hosts: *
    lookup: max -1m unaligned of load15
+     calc: ($load_cpu_number == nan) ? (nan) : ($this)
     units: load
     every: 1m
      warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200)
@@ -43,6 +44,7 @@ component: Load
        os: linux
     hosts: *
    lookup: max -1m unaligned of load5
+     calc: ($load_cpu_number == nan) ? (nan) : ($this)
     units: load
     every: 1m
      warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400)
@@ -58,6 +60,7 @@ component: Load
        os: linux
     hosts: *
    lookup: max -1m unaligned of load1
+     calc: ($load_cpu_number == nan) ? (nan) : ($this)
     units: load
     every: 1m
      warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800)
diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf
index cedaa000..ed980a26 100644
--- a/health/health.d/mdstat.conf
+++ b/health/health.d/mdstat.conf
@@ -20,7 +20,7 @@ component: RAID
     every: 10s
      calc: $down
      crit: $this > 0
-     info: number of devices in the down state for the $family array. \
+     info: number of devices in the down state for the ${label:device} ${label:raid_level} array. \
            Any number > 0 indicates that the array is degraded.
        to: sysadmin
 
@@ -35,7 +35,7 @@ component: RAID
     every: 60s
      warn: $this > 1024
     delay: up 30m
-     info: number of unsynchronized blocks for the $family array
+     info: number of unsynchronized blocks for the ${label:device} ${label:raid_level} array
        to: sysadmin
 
  template: mdstat_nonredundant_last_collected
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 9d5b3b8d..a0723f30 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -15,7 +15,7 @@ component: Network
      calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max) : ( nan )
     units: Mbit
     every: 10s
-     info: network interface $family current speed
+     info: network interface ${label:device} current speed
 
  template: 1m_received_traffic_overflow
        on: net.net
@@ -31,7 +31,7 @@ component: Network
     every: 10s
      warn: $this > (($status >= $WARNING)  ? (85) : (90))
     delay: up 1m down 1m multiplier 1.5 max 1h
-     info: average inbound utilization for the network interface $family over the last minute
+     info: average inbound utilization for the network interface ${label:device} over the last minute
        to: sysadmin
 
  template: 1m_sent_traffic_overflow
@@ -48,7 +48,7 @@ component: Network
     every: 10s
      warn: $this > (($status >= $WARNING)  ? (85) : (90))
     delay: up 1m down 1m multiplier 1.5 max 1h
-     info: average outbound utilization for the network interface $family over the last minute
+     info: average outbound utilization for the network interface ${label:device} over the last minute
        to: sysadmin
 
 # -----------------------------------------------------------------------------
@@ -72,7 +72,7 @@ component: Network
    lookup: sum -10m unaligned absolute of inbound
     units: packets
     every: 1m
-     info: number of inbound dropped packets for the network interface $family in the last 10 minutes
+     info: number of inbound dropped packets for the network interface ${label:device} in the last 10 minutes
 
  template: outbound_packets_dropped
        on: net.drops
@@ -85,7 +85,7 @@ component: Network
    lookup: sum -10m unaligned absolute of outbound
     units: packets
     every: 1m
-     info: number of outbound dropped packets for the network interface $family in the last 10 minutes
+     info: number of outbound dropped packets for the network interface ${label:device} in the last 10 minutes
 
  template: inbound_packets_dropped_ratio
        on: net.packets
@@ -101,7 +101,7 @@ component: Network
     every: 1m
      warn: $this >= 2
     delay: up 1m down 1h multiplier 1.5 max 2h
-     info: ratio of inbound dropped packets for the network interface $family over the last 10 minutes
+     info: ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes
        to: sysadmin
 
  template: outbound_packets_dropped_ratio
@@ -118,7 +118,7 @@ component: Network
     every: 1m
      warn: $this >= 2
     delay: up 1m down 1h multiplier 1.5 max 2h
-     info: ratio of outbound dropped packets for the network interface $family over the last 10 minutes
+     info: ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes
        to: sysadmin
 
  template: wifi_inbound_packets_dropped_ratio
@@ -135,7 +135,7 @@ component: Network
     every: 1m
      warn: $this >= 10
     delay: up 1m down 1h multiplier 1.5 max 2h
-     info: ratio of inbound dropped packets for the network interface $family over the last 10 minutes
+     info: ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes
        to: sysadmin
 
  template: wifi_outbound_packets_dropped_ratio
@@ -152,7 +152,7 @@ component: Network
     every: 1m
      warn: $this >= 10
     delay: up 1m down 1h multiplier 1.5 max 2h
-     info: ratio of outbound dropped packets for the network interface $family over the last 10 minutes
+     info: ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes
        to: sysadmin
 
 # -----------------------------------------------------------------------------
@@ -171,7 +171,7 @@ component: Network
     every: 1m
      warn: $this >= 5
     delay: down 1h multiplier 1.5 max 2h
-     info: number of inbound errors for the network interface $family in the last 10 minutes
+     info: number of inbound errors for the network interface ${label:device} in the last 10 minutes
        to: sysadmin
 
  template: interface_outbound_errors
@@ -187,7 +187,7 @@ component: Network
     every: 1m
      warn: $this >= 5
     delay: down 1h multiplier 1.5 max 2h
-     info: number of outbound errors for the network interface $family in the last 10 minutes
+     info: number of outbound errors for the network interface ${label:device} in the last 10 minutes
        to: sysadmin
 
 # -----------------------------------------------------------------------------
@@ -211,7 +211,7 @@ component: Network
     every: 1m
      warn: $this > 0
     delay: down 1h multiplier 1.5 max 2h
-     info: number of FIFO errors for the network interface $family in the last 10 minutes
+     info: number of FIFO errors for the network interface ${label:device} in the last 10 minutes
        to: sysadmin
 
 # -----------------------------------------------------------------------------
@@ -234,7 +234,7 @@ component: Network
    lookup: average -1m unaligned of received
     units: packets
     every: 10s
-     info: average number of packets received by the network interface $family over the last minute
+     info: average number of packets received by the network interface ${label:device} over the last minute
 
  template: 10s_received_packets_storm
        on: net.packets
@@ -251,6 +251,6 @@ component: Network
      warn: $this > (($status >= $WARNING)?(200):(5000))
      crit: $this > (($status == $CRITICAL)?(5000):(6000))
   options: no-clear-notification
-     info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \
+     info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
            compared to the rate over the last minute
        to: sysadmin
diff --git a/health/health.d/nvme.conf b/health/health.d/nvme.conf
index 5f729d52..b7c0e6fd 100644
--- a/health/health.d/nvme.conf
+++ b/health/health.d/nvme.conf
@@ -11,5 +11,5 @@ component: Disk
     every: 10s
      crit: $this != nan AND $this != 0
     delay: down 5m multiplier 1.5 max 2h
-     info: NVMe device $label:device has critical warnings
+     info: NVMe device ${label:device} has critical warnings
        to: sysadmin
diff --git a/health/health.d/ping.conf b/health/health.d/ping.conf
index cbe7c30c..fa8213ad 100644
--- a/health/health.d/ping.conf
+++ b/health/health.d/ping.conf
@@ -12,7 +12,7 @@ component: Network
     every: 10s
      crit: $this == 0
     delay: down 30m multiplier 1.5 max 2h
-     info: network host $label:host reachability status
+     info: network host ${label:host} reachability status
        to: sysadmin
 
  template: ping_packet_loss
@@ -29,7 +29,7 @@ component: Network
      warn: $this > $green
      crit: $this > $red
     delay: down 30m multiplier 1.5 max 2h
-     info: packet loss percentage to the network host $label:host over the last 10 minutes
+     info: packet loss percentage to the network host ${label:host} over the last 10 minutes
        to: sysadmin
 
  template: ping_host_latency
@@ -46,5 +46,5 @@ component: Network
      warn: $this > $green OR $max > $red
      crit: $this > $red
     delay: down 30m multiplier 1.5 max 2h
-     info: average latency to the network host $label:host over the last 10 seconds
+     info: average latency to the network host ${label:host} over the last 10 seconds
        to: sysadmin
diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf
index 8cbd7729..e8908404 100644
--- a/health/health.d/portcheck.conf
+++ b/health/health.d/portcheck.conf
@@ -10,7 +10,7 @@ component: TCP endpoint
      calc: ($this < 75) ? (0) : ($this)
     every: 5s
     units: up/down
-     info: average ratio of successful connections over the last minute (at least 75%)
+     info: TCP host ${label:host} port ${label:port} liveness status
        to: silent
 
  template: portcheck_connection_timeouts
@@ -25,7 +25,7 @@ component: TCP endpoint
      warn: $this >= 10 AND $this < 40
      crit: $this >= 40
     delay: down 5m multiplier 1.5 max 1h
-     info: average ratio of timeouts over the last 5 minutes
+     info: percentage of timed-out TCP connections to host ${label:host} port ${label:port} in the last 5 minutes
        to: sysadmin
 
  template: portcheck_connection_fails
@@ -40,5 +40,5 @@ component: TCP endpoint
      warn: $this >= 10 AND $this < 40
      crit: $this >= 40
     delay: down 5m multiplier 1.5 max 1h
-     info: average ratio of failed connections over the last 5 minutes
+     info: percentage of failed TCP connections to host ${label:host} port ${label:port} in the last 5 minutes
        to: sysadmin
diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf
index 66d034cf..67b25673 100644
--- a/health/health.d/postgres.conf
+++ b/health/health.d/postgres.conf
@@ -58,7 +58,7 @@ component: PostgreSQL
      warn: $this < (($status >= $WARNING)  ? (70) : (60))
      crit: $this < (($status == $CRITICAL) ? (60) : (50))
     delay: down 15m multiplier 1.5 max 1h
-     info: average cache hit ratio in db $label:database over the last minute
+     info: average cache hit ratio in db ${label:database} over the last minute
        to: dba
 
  template: postgres_db_transactions_rollback_ratio	
@@ -72,7 +72,7 @@ component: PostgreSQL
     every: 1m
      warn: $this > (($status >= $WARNING)  ? (0) : (2))
     delay: down 15m multiplier 1.5 max 1h
-     info: average aborted transactions percentage in db $label:database over the last five minutes
+     info: average aborted transactions percentage in db ${label:database} over the last five minutes
        to: dba
 
  template: postgres_db_deadlocks_rate
@@ -86,7 +86,7 @@ component: PostgreSQL
     every: 1m
      warn: $this > (($status >= $WARNING)  ? (0) : (10))
     delay: down 15m multiplier 1.5 max 1h
-     info: number of deadlocks detected in db $label:database in the last minute
+     info: number of deadlocks detected in db ${label:database} in the last minute
        to: dba
 
 # Table alarms
@@ -104,7 +104,7 @@ component: PostgreSQL
      warn: $this < (($status >= $WARNING)  ? (70) : (60))
      crit: $this < (($status == $CRITICAL) ? (60) : (50))
     delay: down 15m multiplier 1.5 max 1h
-     info: average cache hit ratio in db $label:database table $label:table over the last minute
+     info: average cache hit ratio in db ${label:database} table ${label:table} over the last minute
        to: dba
 
  template: postgres_table_index_cache_io_ratio
@@ -120,7 +120,7 @@ component: PostgreSQL
      warn: $this < (($status >= $WARNING)  ? (70) : (60))
      crit: $this < (($status == $CRITICAL) ? (60) : (50))
     delay: down 15m multiplier 1.5 max 1h
-     info: average index cache hit ratio in db $label:database table $label:table over the last minute
+     info: average index cache hit ratio in db ${label:database} table ${label:table} over the last minute
        to: dba
 
  template: postgres_table_toast_cache_io_ratio
@@ -136,7 +136,7 @@ component: PostgreSQL
      warn: $this < (($status >= $WARNING)  ? (70) : (60))
      crit: $this < (($status == $CRITICAL) ? (60) : (50))
     delay: down 15m multiplier 1.5 max 1h
-     info: average TOAST hit ratio in db $label:database table $label:table over the last minute
+     info: average TOAST hit ratio in db ${label:database} table ${label:table} over the last minute
        to: dba
 
  template: postgres_table_toast_index_cache_io_ratio
@@ -152,7 +152,7 @@ component: PostgreSQL
      warn: $this < (($status >= $WARNING)  ? (70) : (60))
      crit: $this < (($status == $CRITICAL) ? (60) : (50))
     delay: down 15m multiplier 1.5 max 1h
-     info: average index TOAST hit ratio in db $label:database table $label:table over the last minute
+     info: average index TOAST hit ratio in db ${label:database} table ${label:table} over the last minute
        to: dba
 
  template: postgres_table_bloat_size_perc
@@ -161,13 +161,13 @@ component: PostgreSQL
      type: Database
 component: PostgreSQL
     hosts: *
-     calc: $bloat
+     calc: ($table_size > (1024 * 1024 * 100)) ? ($bloat) : (0)
     units: %
     every: 1m
      warn: $this > (($status >= $WARNING)  ? (60) : (70))
      crit: $this > (($status == $CRITICAL) ? (70) : (80))
     delay: down 15m multiplier 1.5 max 1h
-     info: bloat size percentage in db $label:database table $label:table
+     info: bloat size percentage in db ${label:database} table ${label:table}
        to: dba
 
  template: postgres_table_last_autovacuum_time
@@ -180,7 +180,7 @@ component: PostgreSQL
     units: seconds
     every: 1m
      warn: $this != nan AND $this > (60 * 60 * 24 * 7)
-     info: time elapsed since db $label:database table $label:table was vacuumed by the autovacuum daemon
+     info: time elapsed since db ${label:database} table ${label:table} was vacuumed by the autovacuum daemon
        to: dba
 
  template: postgres_table_last_autoanalyze_time
@@ -193,7 +193,7 @@ component: PostgreSQL
     units: seconds
     every: 1m
      warn: $this != nan AND $this > (60 * 60 * 24 * 7)
-     info: time elapsed since db $label:database table $label:table was analyzed by the autovacuum daemon
+     info: time elapsed since db ${label:database} table ${label:table} was analyzed by the autovacuum daemon
        to: dba
 
 # Index alarms
@@ -204,11 +204,11 @@ component: PostgreSQL
      type: Database
 component: PostgreSQL
     hosts: *
-     calc: $bloat
+     calc: ($index_size > (1024 * 1024 * 10)) ? ($bloat) : (0)
     units: %
     every: 1m
      warn: $this > (($status >= $WARNING)  ? (60) : (70))
      crit: $this > (($status == $CRITICAL) ? (70) : (80))
     delay: down 15m multiplier 1.5 max 1h
-     info: bloat size percentage in db $label:database table $label:table index $label:index
+     info: bloat size percentage in db ${label:database} table ${label:table} index ${label:index}
        to: dba
diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf
index 785838d4..7f8ea279 100644
--- a/health/health.d/zfs.conf
+++ b/health/health.d/zfs.conf
@@ -24,7 +24,7 @@ component: File system
     every: 10s
      warn: $this > 0
     delay: down 1m multiplier 1.5 max 1h
-     info: ZFS pool $family state is degraded
+     info: ZFS pool ${label:pool} state is degraded
        to: sysadmin
 
  template: zfs_pool_state_crit
@@ -37,5 +37,5 @@ component: File system
     every: 10s
      crit: $this > 0
     delay: down 1m multiplier 1.5 max 1h
-     info: ZFS pool $family state is faulted or unavail
+     info: ZFS pool ${label:pool} state is faulted or unavail
        to: sysadmin
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2023-02-06 16:11:34 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2023-02-06 16:11:34 +0000
commit	d079b656b4719739b2247dcd9d46e9bec793095a (patch)
tree	d2c950c70a776bcf697c963151c5bd959f8a9f03 /health/health.d
parent	Releasing debian version 1.37.1-2. (diff)
download	netdata-d079b656b4719739b2247dcd9d46e9bec793095a.tar.xz netdata-d079b656b4719739b2247dcd9d46e9bec793095a.zip