diff options
Diffstat (limited to 'health/health.d/consul.conf')
-rw-r--r-- | health/health.d/consul.conf | 32 |
1 files changed, 22 insertions, 10 deletions
diff --git a/health/health.d/consul.conf b/health/health.d/consul.conf
index 7edca656..8b414a26 100644
--- a/health/health.d/consul.conf
+++ b/health/health.d/consul.conf
@@ -10,6 +10,7 @@ component: Consul
  units: seconds
  warn: $this < 14*24*60*60
  crit: $this < 7*24*60*60
+ summary: Consul license expiration on ${label:node_name}
  info: Consul Enterprise license expiration time on node ${label:node_name} datacenter ${label:datacenter}
  to: sysadmin
 
@@ -23,7 +24,8 @@ component: Consul
  units: status
  warn: $this == 1
  delay: down 5m multiplier 1.5 max 1h
- info: datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name}
+ summary: Consul datacenter ${label:datacenter} health
+ info: Datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name}
  to: sysadmin
 
  template: consul_autopilot_server_health_status
@@ -36,7 +38,8 @@ component: Consul
  units: status
  warn: $this == 1
  delay: down 5m multiplier 1.5 max 1h
- info: server ${label:node_name} from datacenter ${label:datacenter} is unhealthy
+ summary: Consul server ${label:node_name} health
+ info: Server ${label:node_name} from datacenter ${label:datacenter} is unhealthy
  to: sysadmin
 
  template: consul_raft_leader_last_contact_time
@@ -50,7 +53,8 @@ component: Consul
  warn: $this > (($status >= $WARNING) ? (150) : (200))
  crit: $this > (($status == $CRITICAL) ? (200) : (500))
  delay: down 5m multiplier 1.5 max 1h
- info: median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes
+ summary: Consul leader server ${label:node_name} last contact time
+ info: Median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes
  to: sysadmin
 
  template: consul_raft_leadership_transitions
@@ -63,7 +67,8 @@ component: Consul
  units: transitions
  warn: $this > 0
  delay: down 5m multiplier 1.5 max 1h
- info: there has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader
+ summary: Consul server ${label:node_name} leadership transitions
+ info: There has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader
  to: sysadmin
 
  template: consul_raft_thread_main_saturation
@@ -76,7 +81,8 @@ component: Consul
  units: percentage
  warn: $this > (($status >= $WARNING) ? (40) : (50))
  delay: down 5m multiplier 1.5 max 1h
- info: average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
+ summary: Consul server ${label:node_name} main Raft saturation
+ info: Average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
  to: sysadmin
 
  template: consul_raft_thread_fsm_saturation
@@ -89,7 +95,8 @@ component: Consul
  units: milliseconds
  warn: $this > (($status >= $WARNING) ? (40) : (50))
  delay: down 5m multiplier 1.5 max 1h
- info: average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
+ summary: Consul server ${label:node_name} FSM Raft saturation
+ info: Average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
  to: sysadmin
 
  template: consul_client_rpc_requests_exceeded
@@ -102,7 +109,8 @@ component: Consul
  units: requests
  warn: $this > (($status >= $WARNING) ? (0) : (5))
  delay: down 5m multiplier 1.5 max 1h
- info: number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
+ summary: Consul server ${label:node_name} RPC requests rate
+ info: Number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
  to: sysadmin
 
  template: consul_client_rpc_requests_failed
@@ -115,6 +123,7 @@ component: Consul
  units: requests
  warn: $this > (($status >= $WARNING) ? (0) : (5))
  delay: down 5m multiplier 1.5 max 1h
+ summary: Consul server ${label:node_name} failed RPC requests
  info: number of failed RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
  to: sysadmin
 
@@ -128,7 +137,8 @@ component: Consul
  units: status
  warn: $this != nan AND $this != 0
  delay: down 5m multiplier 1.5 max 1h
- info: node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
+ summary: Consul node health check ${label:check_name} on ${label:node_name}
+ info: Node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
  to: sysadmin
 
  template: consul_service_health_check_status
@@ -141,7 +151,8 @@ component: Consul
  units: status
  warn: $this == 1
  delay: down 5m multiplier 1.5 max 1h
- info: service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
+ summary: Consul service health check ${label:check_name} service ${label:service_name} node ${label:node_name}
+ info: Service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
  to: sysadmin
 
  template: consul_gc_pause_time
@@ -155,5 +166,6 @@ component: Consul
  warn: $this > (($status >= $WARNING) ? (1) : (2))
  crit: $this > (($status >= $WARNING) ? (2) : (5))
  delay: down 5m multiplier 1.5 max 1h
- info: time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter}
+ summary: Consul server ${label:node_name} garbage collection pauses
+ info: Time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter}
  to: sysadmin