summaryrefslogtreecommitdiffstats
path: root/health/health.d/consul.conf
diff options
context:
space:
mode:
Diffstat (limited to 'health/health.d/consul.conf')
-rw-r--r--health/health.d/consul.conf159
1 files changed, 159 insertions, 0 deletions
diff --git a/health/health.d/consul.conf b/health/health.d/consul.conf
new file mode 100644
index 000000000..dff6d2df3
--- /dev/null
+++ b/health/health.d/consul.conf
@@ -0,0 +1,159 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: consul_license_expiration_time
+ on: consul.license_expiration_time
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ calc: $license_expiration
+ every: 60m
+ units: seconds
+ warn: $this < 14*24*60*60
+ crit: $this < 7*24*60*60
+ info: Consul Enterprise licence expiration time on node ${label:node_name} datacenter ${label:datacenter}
+ to: sysadmin
+
+ template: consul_autopilot_health_status
+ on: consul.autopilot_health_status
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ calc: $unhealthy
+ every: 10s
+ units: status
+ warn: $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ info: datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name}
+ to: sysadmin
+
+ template: consul_autopilot_server_health_status
+ on: consul.autopilot_server_health_status
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ calc: $unhealthy
+ every: 10s
+ units: status
+ warn: $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ info: server ${label:node_name} from datacenter ${label:datacenter} is unhealthy
+ to: sysadmin
+
+ template: consul_raft_leader_last_contact_time
+ on: consul.raft_leader_last_contact_time
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ lookup: average -1m unaligned of quantile_0.5
+ every: 10s
+ units: milliseconds
+ warn: $this > (($status >= $WARNING) ? (150) : (200))
+ crit: $this > (($status == $CRITICAL) ? (200) : (500))
+ delay: down 5m multiplier 1.5 max 1h
+ info: median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes
+ to: sysadmin
+
+ template: consul_raft_leadership_transitions
+ on: consul.raft_leadership_transitions_rate
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ lookup: sum -1m unaligned
+ every: 10s
+ units: transitions
+ warn: $this > 0
+ delay: down 5m multiplier 1.5 max 1h
+ info: there has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader
+ to: sysadmin
+
+ template: consul_raft_thread_main_saturation
+ on: consul.raft_thread_main_saturation_perc
+ class: Utilization
+ type: ServiceMesh
+component: Consul
+ lookup: average -1m unaligned of quantile_0.9
+ every: 10s
+ units: percentage
+ warn: $this > (($status >= $WARNING) ? (40) : (50))
+ delay: down 5m multiplier 1.5 max 1h
+ info: average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
+ to: sysadmin
+
+ template: consul_raft_thread_fsm_saturation
+ on: consul.raft_thread_fsm_saturation_perc
+ class: Utilization
+ type: ServiceMesh
+component: Consul
+ lookup: average -1m unaligned of quantile_0.9
+ every: 10s
+ units: milliseconds
+ warn: $this > (($status >= $WARNING) ? (40) : (50))
+ delay: down 5m multiplier 1.5 max 1h
+ info: average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
+ to: sysadmin
+
+ template: consul_client_rpc_requests_exceeded
+ on: consul.client_rpc_requests_exceeded_rate
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ lookup: sum -1m unaligned
+ every: 10s
+ units: requests
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
+ to: sysadmin
+
+ template: consul_client_rpc_requests_failed
+ on: consul.client_rpc_requests_failed_rate
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ lookup: sum -1m unaligned
+ every: 10s
+ units: requests
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of failed RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
+ to: sysadmin
+
+ template: consul_node_health_check_status
+ on: consul.node_health_check_status
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ calc: $warning + $critical
+ every: 10s
+ units: status
+ warn: $this != nan AND $this != 0
+ delay: down 5m multiplier 1.5 max 1h
+ info: node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
+ to: sysadmin
+
+ template: consul_service_health_check_status
+ on: consul.service_health_check_status
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ calc: $warning + $critical
+ every: 10s
+ units: status
+ warn: $this == 1
+ delay: down 5m multiplier 1.5 max 1h
+ info: service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
+ to: sysadmin
+
+ template: consul_gc_pause_time
+ on: consul.gc_pause_time
+ class: Errors
+ type: ServiceMesh
+component: Consul
+ lookup: sum -1m unaligned
+ every: 10s
+ units: seconds
+ warn: $this > (($status >= $WARNING) ? (1) : (2))
+ crit: $this > (($status >= $WARNING) ? (2) : (5))
+ delay: down 5m multiplier 1.5 max 1h
+ info: time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter}
+ to: sysadmin