diff options
Diffstat (limited to '')
-rw-r--r-- | health/health.d/consul.conf | 171 |
1 files changed, 171 insertions, 0 deletions
diff --git a/health/health.d/consul.conf b/health/health.d/consul.conf new file mode 100644 index 00000000..8b414a26 --- /dev/null +++ b/health/health.d/consul.conf @@ -0,0 +1,171 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: consul_license_expiration_time + on: consul.license_expiration_time + class: Errors + type: ServiceMesh +component: Consul + calc: $license_expiration + every: 60m + units: seconds + warn: $this < 14*24*60*60 + crit: $this < 7*24*60*60 + summary: Consul license expiration on ${label:node_name} + info: Consul Enterprise license expiration time on node ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_autopilot_health_status + on: consul.autopilot_health_status + class: Errors + type: ServiceMesh +component: Consul + calc: $unhealthy + every: 10s + units: status + warn: $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: Consul datacenter ${label:datacenter} health + info: Datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name} + to: sysadmin + + template: consul_autopilot_server_health_status + on: consul.autopilot_server_health_status + class: Errors + type: ServiceMesh +component: Consul + calc: $unhealthy + every: 10s + units: status + warn: $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: Consul server ${label:node_name} health + info: Server ${label:node_name} from datacenter ${label:datacenter} is unhealthy + to: sysadmin + + template: consul_raft_leader_last_contact_time + on: consul.raft_leader_last_contact_time + class: Errors + type: ServiceMesh +component: Consul + lookup: average -1m unaligned of quantile_0.5 + every: 10s + units: milliseconds + warn: $this > (($status >= $WARNING) ? (150) : (200)) + crit: $this > (($status == $CRITICAL) ? (200) : (500)) + delay: down 5m multiplier 1.5 max 1h + summary: Consul leader server ${label:node_name} last contact time + info: Median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes + to: sysadmin + + template: consul_raft_leadership_transitions + on: consul.raft_leadership_transitions_rate + class: Errors + type: ServiceMesh +component: Consul + lookup: sum -1m unaligned + every: 10s + units: transitions + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + summary: Consul server ${label:node_name} leadership transitions + info: There has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader + to: sysadmin + + template: consul_raft_thread_main_saturation + on: consul.raft_thread_main_saturation_perc + class: Utilization + type: ServiceMesh +component: Consul + lookup: average -1m unaligned of quantile_0.9 + every: 10s + units: percentage + warn: $this > (($status >= $WARNING) ? (40) : (50)) + delay: down 5m multiplier 1.5 max 1h + summary: Consul server ${label:node_name} main Raft saturation + info: Average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_raft_thread_fsm_saturation + on: consul.raft_thread_fsm_saturation_perc + class: Utilization + type: ServiceMesh +component: Consul + lookup: average -1m unaligned of quantile_0.9 + every: 10s + units: milliseconds + warn: $this > (($status >= $WARNING) ? (40) : (50)) + delay: down 5m multiplier 1.5 max 1h + summary: Consul server ${label:node_name} FSM Raft saturation + info: Average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_client_rpc_requests_exceeded + on: consul.client_rpc_requests_exceeded_rate + class: Errors + type: ServiceMesh +component: Consul + lookup: sum -1m unaligned + every: 10s + units: requests + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: down 5m multiplier 1.5 max 1h + summary: Consul server ${label:node_name} RPC requests rate + info: Number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_client_rpc_requests_failed + on: consul.client_rpc_requests_failed_rate + class: Errors + type: ServiceMesh +component: Consul + lookup: sum -1m unaligned + every: 10s + units: requests + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: down 5m multiplier 1.5 max 1h + summary: Consul server ${label:node_name} failed RPC requests + info: number of failed RPC requests made by server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_node_health_check_status + on: consul.node_health_check_status + class: Errors + type: ServiceMesh +component: Consul + calc: $warning + $critical + every: 10s + units: status + warn: $this != nan AND $this != 0 + delay: down 5m multiplier 1.5 max 1h + summary: Consul node health check ${label:check_name} on ${label:node_name} + info: Node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_service_health_check_status + on: consul.service_health_check_status + class: Errors + type: ServiceMesh +component: Consul + calc: $warning + $critical + every: 10s + units: status + warn: $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: Consul service health check ${label:check_name} service ${label:service_name} node ${label:node_name} + info: Service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_gc_pause_time + on: consul.gc_pause_time + class: Errors + type: ServiceMesh +component: Consul + lookup: sum -1m unaligned + every: 10s + units: seconds + warn: $this > (($status >= $WARNING) ? (1) : (2)) + crit: $this > (($status >= $WARNING) ? (2) : (5)) + delay: down 5m multiplier 1.5 max 1h + summary: Consul server ${label:node_name} garbage collection pauses + info: Time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin |