summary | refs | log | tree | commit | diff | stats
path: root/health/health.d/consul.conf
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-07-24 09:54:23 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-07-24 09:54:44 +0000
commit: 836b47cb7e99a977c5a23b059ca1d0b5065d310e (patch)
tree: 1604da8f482d02effa033c94a84be42bc0c848c3 /health/health.d/consul.conf
parent: Releasing debian version 1.44.3-2. (diff)
download: netdata-836b47cb7e99a977c5a23b059ca1d0b5065d310e.tar.xz
netdata-836b47cb7e99a977c5a23b059ca1d0b5065d310e.zip
Merging upstream version 1.46.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health/health.d/consul.conf')
-rw-r--r-- health/health.d/consul.conf 171
1 files changed, 0 insertions, 171 deletions
diff --git a/health/health.d/consul.conf b/health/health.d/consul.conf
deleted file mode 100644
index 8b414a26d..000000000
--- a/health/health.d/consul.conf
+++ /dev/null
@@ -1,171 +0,0 @@
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# Consul Enterprise license expiry alert, evaluated every 60 minutes.
-# $this is taken from the chart's license_expiration dimension (units: seconds).
-# Warning when less than 14 days (14*24*60*60 s) remain, critical below 7 days.
- template: consul_license_expiration_time
- on: consul.license_expiration_time
- class: Errors
- type: ServiceMesh
-component: Consul
- calc: $license_expiration
- every: 60m
- units: seconds
- warn: $this < 14*24*60*60
- crit: $this < 7*24*60*60
- summary: Consul license expiration on ${label:node_name}
- info: Consul Enterprise license expiration time on node ${label:node_name} datacenter ${label:datacenter}
- to: sysadmin
-
-# Datacenter-level autopilot health alert, evaluated every 10 seconds.
-# Warns while the chart's 'unhealthy' dimension equals 1; no critical level is defined.
-# The 'delay' line holds back notifications when the alert de-escalates
-# (5m base, growing by 1.5x, capped at 1h) - see the Netdata health reference.
- template: consul_autopilot_health_status
- on: consul.autopilot_health_status
- class: Errors
- type: ServiceMesh
-component: Consul
- calc: $unhealthy
- every: 10s
- units: status
- warn: $this == 1
- delay: down 5m multiplier 1.5 max 1h
- summary: Consul datacenter ${label:datacenter} health
- info: Datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name}
- to: sysadmin
-
-# Per-server autopilot health alert, evaluated every 10 seconds.
-# Warns while the chart's 'unhealthy' dimension equals 1; no critical level is defined.
- template: consul_autopilot_server_health_status
- on: consul.autopilot_server_health_status
- class: Errors
- type: ServiceMesh
-component: Consul
- calc: $unhealthy
- every: 10s
- units: status
- warn: $this == 1
- delay: down 5m multiplier 1.5 max 1h
- summary: Consul server ${label:node_name} health
- info: Server ${label:node_name} from datacenter ${label:datacenter} is unhealthy
- to: sysadmin
-
-# Raft leader last-contact alert, evaluated every 10 seconds.
-# $this is the 1-minute average of the quantile_0.5 dimension (units: milliseconds).
-# Thresholds use hysteresis on the current $status:
-#   warning  is raised above 200 and kept until the value is no longer above 150;
-#   critical is raised above 500 and kept until the value is no longer above 200.
- template: consul_raft_leader_last_contact_time
- on: consul.raft_leader_last_contact_time
- class: Errors
- type: ServiceMesh
-component: Consul
- lookup: average -1m unaligned of quantile_0.5
- every: 10s
- units: milliseconds
- warn: $this > (($status >= $WARNING) ? (150) : (200))
- crit: $this > (($status == $CRITICAL) ? (200) : (500))
- delay: down 5m multiplier 1.5 max 1h
- summary: Consul leader server ${label:node_name} last contact time
- info: Median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes
- to: sysadmin
-
-# Raft leadership transition alert, evaluated every 10 seconds.
-# $this is the sum of the chart's values over the last minute;
-# any value above 0 raises a warning. No critical level is defined.
- template: consul_raft_leadership_transitions
- on: consul.raft_leadership_transitions_rate
- class: Errors
- type: ServiceMesh
-component: Consul
- lookup: sum -1m unaligned
- every: 10s
- units: transitions
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 1h
- summary: Consul server ${label:node_name} leadership transitions
- info: There has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader
- to: sysadmin
-
-# Main Raft thread saturation alert, evaluated every 10 seconds.
-# $this is the 1-minute average of the quantile_0.9 dimension (units: percentage).
-# Warning is raised above 50 and, once active, kept until the value is no longer above 40.
- template: consul_raft_thread_main_saturation
- on: consul.raft_thread_main_saturation_perc
- class: Utilization
- type: ServiceMesh
-component: Consul
- lookup: average -1m unaligned of quantile_0.9
- every: 10s
- units: percentage
- warn: $this > (($status >= $WARNING) ? (40) : (50))
- delay: down 5m multiplier 1.5 max 1h
- summary: Consul server ${label:node_name} main Raft saturation
- info: Average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
- to: sysadmin
-
-# FSM Raft thread saturation alert, evaluated every 10 seconds.
-# $this is the 1-minute average of the quantile_0.9 dimension of a saturation
-# percentage chart (..._saturation_perc), so the unit is a percentage, not a time.
-# Warning is raised above 50 and, once active, kept until the value is no longer above 40.
- template: consul_raft_thread_fsm_saturation
- on: consul.raft_thread_fsm_saturation_perc
- class: Utilization
- type: ServiceMesh
-component: Consul
- lookup: average -1m unaligned of quantile_0.9
- every: 10s
- units: percentage
- warn: $this > (($status >= $WARNING) ? (40) : (50))
- delay: down 5m multiplier 1.5 max 1h
- summary: Consul server ${label:node_name} FSM Raft saturation
- info: Average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
- to: sysadmin
-
-# Rate-limited client RPC requests alert, evaluated every 10 seconds.
-# $this is the sum of the chart's values over the last minute.
-# Warning is raised above 5 and, once active, kept while the sum stays above 0.
- template: consul_client_rpc_requests_exceeded
- on: consul.client_rpc_requests_exceeded_rate
- class: Errors
- type: ServiceMesh
-component: Consul
- lookup: sum -1m unaligned
- every: 10s
- units: requests
- warn: $this > (($status >= $WARNING) ? (0) : (5))
- delay: down 5m multiplier 1.5 max 1h
- summary: Consul server ${label:node_name} RPC requests rate
- info: Number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
- to: sysadmin
-
-# Failed client RPC requests alert, evaluated every 10 seconds.
-# $this is the sum of the chart's values over the last minute.
-# Warning is raised above 5 and, once active, kept while the sum stays above 0.
- template: consul_client_rpc_requests_failed
- on: consul.client_rpc_requests_failed_rate
- class: Errors
- type: ServiceMesh
-component: Consul
- lookup: sum -1m unaligned
- every: 10s
- units: requests
- warn: $this > (($status >= $WARNING) ? (0) : (5))
- delay: down 5m multiplier 1.5 max 1h
- summary: Consul server ${label:node_name} failed RPC requests
- info: number of failed RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
- to: sysadmin
-
-# Node health check alert, evaluated every 10 seconds.
-# $this is the sum of the chart's 'warning' and 'critical' dimensions.
-# Warns whenever that sum is a real number (not nan) and non-zero.
- template: consul_node_health_check_status
- on: consul.node_health_check_status
- class: Errors
- type: ServiceMesh
-component: Consul
- calc: $warning + $critical
- every: 10s
- units: status
- warn: $this != nan AND $this != 0
- delay: down 5m multiplier 1.5 max 1h
- summary: Consul node health check ${label:check_name} on ${label:node_name}
- info: Node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
- to: sysadmin
-
-# Service health check alert, evaluated every 10 seconds.
-# $this is the sum of the chart's 'warning' and 'critical' dimensions.
-# Warns only when that sum is exactly 1.
-# NOTE(review): a sum of 2 would not match; presumably the two dimensions are
-# mutually exclusive 0/1 flags - confirm against the collector.
- template: consul_service_health_check_status
- on: consul.service_health_check_status
- class: Errors
- type: ServiceMesh
-component: Consul
- calc: $warning + $critical
- every: 10s
- units: status
- warn: $this == 1
- delay: down 5m multiplier 1.5 max 1h
- summary: Consul service health check ${label:check_name} service ${label:service_name} node ${label:node_name}
- info: Service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
- to: sysadmin
-
-# Garbage collection pause alert, evaluated every 10 seconds.
-# $this is the sum of the chart's values over the last minute (units: seconds).
-# Thresholds use hysteresis on the current $status:
-#   warning  is raised above 2 and kept until the value is no longer above 1;
-#   critical is raised above 5 and kept until the value is no longer above 2.
-# The critical line must test for $CRITICAL (not $WARNING); otherwise an alert
-# that is merely in WARNING would escalate to critical as soon as the value
-# exceeds 2, making the 5-second entry threshold ineffective.
- template: consul_gc_pause_time
- on: consul.gc_pause_time
- class: Errors
- type: ServiceMesh
-component: Consul
- lookup: sum -1m unaligned
- every: 10s
- units: seconds
- warn: $this > (($status >= $WARNING) ? (1) : (2))
- crit: $this > (($status == $CRITICAL) ? (2) : (5))
- delay: down 5m multiplier 1.5 max 1h
- summary: Consul server ${label:node_name} garbage collection pauses
- info: Time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter}
- to: sysadmin