diff options
Diffstat (limited to 'health/health.d/consul.conf')
-rw-r--r-- | health/health.d/consul.conf | 171 |
1 files changed, 0 insertions, 171 deletions
Patch summary: removes health/health.d/consul.conf in its entirety (171 lines, 12 Consul alert templates).
The removed lines below must match the pre-image exactly; editing anything inside the hunk will stop the patch from applying.
NOTE(review): key column alignment was reconstructed from whitespace-collapsed text (keys right-aligned to 9 characters) - confirm against blob 8b414a26d.
NOTE(review): should this file be reinstated elsewhere, two entries look worth re-checking: consul_raft_thread_fsm_saturation declares "units: milliseconds" on a *_perc chart while its main-thread sibling declares "percentage", and consul_gc_pause_time keys its crit hysteresis on $status >= $WARNING whereas consul_raft_leader_last_contact_time keys it on $status == $CRITICAL.

diff --git a/health/health.d/consul.conf b/health/health.d/consul.conf
deleted file mode 100644
index 8b414a26d..000000000
--- a/health/health.d/consul.conf
+++ /dev/null
@@ -1,171 +0,0 @@
-# you can disable an alarm notification by setting the 'to' line to: silent
-
- template: consul_license_expiration_time
-       on: consul.license_expiration_time
-    class: Errors
-     type: ServiceMesh
-component: Consul
-     calc: $license_expiration
-    every: 60m
-    units: seconds
-     warn: $this < 14*24*60*60
-     crit: $this < 7*24*60*60
-  summary: Consul license expiration on ${label:node_name}
-     info: Consul Enterprise license expiration time on node ${label:node_name} datacenter ${label:datacenter}
-       to: sysadmin
-
- template: consul_autopilot_health_status
-       on: consul.autopilot_health_status
-    class: Errors
-     type: ServiceMesh
-component: Consul
-     calc: $unhealthy
-    every: 10s
-    units: status
-     warn: $this == 1
-    delay: down 5m multiplier 1.5 max 1h
-  summary: Consul datacenter ${label:datacenter} health
-     info: Datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name}
-       to: sysadmin
-
- template: consul_autopilot_server_health_status
-       on: consul.autopilot_server_health_status
-    class: Errors
-     type: ServiceMesh
-component: Consul
-     calc: $unhealthy
-    every: 10s
-    units: status
-     warn: $this == 1
-    delay: down 5m multiplier 1.5 max 1h
-  summary: Consul server ${label:node_name} health
-     info: Server ${label:node_name} from datacenter ${label:datacenter} is unhealthy
-       to: sysadmin
-
- template: consul_raft_leader_last_contact_time
-       on: consul.raft_leader_last_contact_time
-    class: Errors
-     type: ServiceMesh
-component: Consul
-   lookup: average -1m unaligned of quantile_0.5
-    every: 10s
-    units: milliseconds
-     warn: $this > (($status >= $WARNING) ? (150) : (200))
-     crit: $this > (($status == $CRITICAL) ? (200) : (500))
-    delay: down 5m multiplier 1.5 max 1h
-  summary: Consul leader server ${label:node_name} last contact time
-     info: Median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes
-       to: sysadmin
-
- template: consul_raft_leadership_transitions
-       on: consul.raft_leadership_transitions_rate
-    class: Errors
-     type: ServiceMesh
-component: Consul
-   lookup: sum -1m unaligned
-    every: 10s
-    units: transitions
-     warn: $this > 0
-    delay: down 5m multiplier 1.5 max 1h
-  summary: Consul server ${label:node_name} leadership transitions
-     info: There has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader
-       to: sysadmin
-
- template: consul_raft_thread_main_saturation
-       on: consul.raft_thread_main_saturation_perc
-    class: Utilization
-     type: ServiceMesh
-component: Consul
-   lookup: average -1m unaligned of quantile_0.9
-    every: 10s
-    units: percentage
-     warn: $this > (($status >= $WARNING) ? (40) : (50))
-    delay: down 5m multiplier 1.5 max 1h
-  summary: Consul server ${label:node_name} main Raft saturation
-     info: Average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
-       to: sysadmin
-
- template: consul_raft_thread_fsm_saturation
-       on: consul.raft_thread_fsm_saturation_perc
-    class: Utilization
-     type: ServiceMesh
-component: Consul
-   lookup: average -1m unaligned of quantile_0.9
-    every: 10s
-    units: milliseconds
-     warn: $this > (($status >= $WARNING) ? (40) : (50))
-    delay: down 5m multiplier 1.5 max 1h
-  summary: Consul server ${label:node_name} FSM Raft saturation
-     info: Average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
-       to: sysadmin
-
- template: consul_client_rpc_requests_exceeded
-       on: consul.client_rpc_requests_exceeded_rate
-    class: Errors
-     type: ServiceMesh
-component: Consul
-   lookup: sum -1m unaligned
-    every: 10s
-    units: requests
-     warn: $this > (($status >= $WARNING) ? (0) : (5))
-    delay: down 5m multiplier 1.5 max 1h
-  summary: Consul server ${label:node_name} RPC requests rate
-     info: Number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
-       to: sysadmin
-
- template: consul_client_rpc_requests_failed
-       on: consul.client_rpc_requests_failed_rate
-    class: Errors
-     type: ServiceMesh
-component: Consul
-   lookup: sum -1m unaligned
-    every: 10s
-    units: requests
-     warn: $this > (($status >= $WARNING) ? (0) : (5))
-    delay: down 5m multiplier 1.5 max 1h
-  summary: Consul server ${label:node_name} failed RPC requests
-     info: number of failed RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
-       to: sysadmin
-
- template: consul_node_health_check_status
-       on: consul.node_health_check_status
-    class: Errors
-     type: ServiceMesh
-component: Consul
-     calc: $warning + $critical
-    every: 10s
-    units: status
-     warn: $this != nan AND $this != 0
-    delay: down 5m multiplier 1.5 max 1h
-  summary: Consul node health check ${label:check_name} on ${label:node_name}
-     info: Node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
-       to: sysadmin
-
- template: consul_service_health_check_status
-       on: consul.service_health_check_status
-    class: Errors
-     type: ServiceMesh
-component: Consul
-     calc: $warning + $critical
-    every: 10s
-    units: status
-     warn: $this == 1
-    delay: down 5m multiplier 1.5 max 1h
-  summary: Consul service health check ${label:check_name} service ${label:service_name} node ${label:node_name}
-     info: Service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
-       to: sysadmin
-
- template: consul_gc_pause_time
-       on: consul.gc_pause_time
-    class: Errors
-     type: ServiceMesh
-component: Consul
-   lookup: sum -1m unaligned
-    every: 10s
-    units: seconds
-     warn: $this > (($status >= $WARNING) ? (1) : (2))
-     crit: $this > (($status >= $WARNING) ? (2) : (5))
-    delay: down 5m multiplier 1.5 max 1h
-  summary: Consul server ${label:node_name} garbage collection pauses
-     info: Time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter}
-       to: sysadmin