# you can disable an alarm notification by setting the 'to' line to: silent

# Health alert templates for Consul charts.
# Each stanza below is one template: one "key: value" pair per line,
# stanzas separated by a blank line.
# Thresholds written as (($status >= $WARNING) ? (low) : (high)) apply
# hysteresis: the alert raises above `high`, and while it is already raised
# the value must fall to `low` or below before it clears.

# License expiration: warn below 14 days remaining, critical below 7 days
# (thresholds are expressed in seconds).
 template: consul_license_expiration_time
       on: consul.license_expiration_time
    class: Errors
     type: ServiceMesh
component: Consul
     calc: $license_expiration
    every: 60m
    units: seconds
     warn: $this < 14*24*60*60
     crit: $this < 7*24*60*60
     info: Consul Enterprise licence expiration time on node ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

# Autopilot cluster health: warn when the `unhealthy` dimension equals 1.
 template: consul_autopilot_health_status
       on: consul.autopilot_health_status
    class: Errors
     type: ServiceMesh
component: Consul
     calc: $unhealthy
    every: 10s
    units: status
     warn: $this == 1
    delay: down 5m multiplier 1.5 max 1h
     info: datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name}
       to: sysadmin

# Autopilot per-server health: warn when the `unhealthy` dimension equals 1.
 template: consul_autopilot_server_health_status
       on: consul.autopilot_server_health_status
    class: Errors
     type: ServiceMesh
component: Consul
     calc: $unhealthy
    every: 10s
    units: status
     warn: $this == 1
    delay: down 5m multiplier 1.5 max 1h
     info: server ${label:node_name} from datacenter ${label:datacenter} is unhealthy
       to: sysadmin

# Raft leader last-contact time: 1-minute average of the quantile_0.5
# dimension. Warn above 200 (clears at 150), critical above 500
# (clears at 200).
 template: consul_raft_leader_last_contact_time
       on: consul.raft_leader_last_contact_time
    class: Errors
     type: ServiceMesh
component: Consul
   lookup: average -1m unaligned of quantile_0.5
    every: 10s
    units: milliseconds
     warn: $this > (($status >= $WARNING) ? (150) : (200))
     crit: $this > (($status == $CRITICAL) ? (200) : (500))
    delay: down 5m multiplier 1.5 max 1h
     info: median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes
       to: sysadmin

# Raft leadership transitions: warn on any transition summed over the
# last minute.
 template: consul_raft_leadership_transitions
       on: consul.raft_leadership_transitions_rate
    class: Errors
     type: ServiceMesh
component: Consul
   lookup: sum -1m unaligned
    every: 10s
    units: transitions
     warn: $this > 0
    delay: down 5m multiplier 1.5 max 1h
     info: there has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader
       to: sysadmin

# Main Raft thread saturation: 1-minute average of the quantile_0.9
# dimension. Warn above 50 (clears at 40).
 template: consul_raft_thread_main_saturation
       on: consul.raft_thread_main_saturation_perc
    class: Utilization
     type: ServiceMesh
component: Consul
   lookup: average -1m unaligned of quantile_0.9
    every: 10s
    units: percentage
     warn: $this > (($status >= $WARNING) ? (40) : (50))
    delay: down 5m multiplier 1.5 max 1h
     info: average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

# FSM Raft thread saturation: 1-minute average of the quantile_0.9
# dimension. Warn above 50 (clears at 40). Units are a percentage, in line
# with the *_saturation_perc chart and the main-thread alert.
 template: consul_raft_thread_fsm_saturation
       on: consul.raft_thread_fsm_saturation_perc
    class: Utilization
     type: ServiceMesh
component: Consul
   lookup: average -1m unaligned of quantile_0.9
    every: 10s
    units: percentage
     warn: $this > (($status >= $WARNING) ? (40) : (50))
    delay: down 5m multiplier 1.5 max 1h
     info: average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

# Rate-limited client RPC requests: sum over the last minute.
# Warn above 5; stays raised until the sum returns to 0.
 template: consul_client_rpc_requests_exceeded
       on: consul.client_rpc_requests_exceeded_rate
    class: Errors
     type: ServiceMesh
component: Consul
   lookup: sum -1m unaligned
    every: 10s
    units: requests
     warn: $this > (($status >= $WARNING) ? (0) : (5))
    delay: down 5m multiplier 1.5 max 1h
     info: number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

# Failed client RPC requests: sum over the last minute.
# Warn above 5; stays raised until the sum returns to 0.
 template: consul_client_rpc_requests_failed
       on: consul.client_rpc_requests_failed_rate
    class: Errors
     type: ServiceMesh
component: Consul
   lookup: sum -1m unaligned
    every: 10s
    units: requests
     warn: $this > (($status >= $WARNING) ? (0) : (5))
    delay: down 5m multiplier 1.5 max 1h
     info: number of failed RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

# Node health check: warn when the sum of the `warning` and `critical`
# dimensions is a number other than 0.
 template: consul_node_health_check_status
       on: consul.node_health_check_status
    class: Errors
     type: ServiceMesh
component: Consul
     calc: $warning + $critical
    every: 10s
    units: status
     warn: $this != nan AND $this != 0
    delay: down 5m multiplier 1.5 max 1h
     info: node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

# Service health check: same calculation and condition as the node check.
# NOTE(review): assumes each status dimension reports 0 or 1, so this
# matches the previous `$this == 1` condition - confirm against the collector.
 template: consul_service_health_check_status
       on: consul.service_health_check_status
    class: Errors
     type: ServiceMesh
component: Consul
     calc: $warning + $critical
    every: 10s
    units: status
     warn: $this != nan AND $this != 0
    delay: down 5m multiplier 1.5 max 1h
     info: service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

# GC pause time: sum over the last minute. Warn above 2 (clears at 1),
# critical above 5 (clears at 2). The critical hysteresis is keyed on
# $CRITICAL so that a WARNING state does not lower the critical threshold
# from 5 to 2.
 template: consul_gc_pause_time
       on: consul.gc_pause_time
    class: Errors
     type: ServiceMesh
component: Consul
   lookup: sum -1m unaligned
    every: 10s
    units: seconds
     warn: $this > (($status >= $WARNING) ? (1) : (2))
     crit: $this > (($status == $CRITICAL) ? (2) : (5))
    delay: down 5m multiplier 1.5 max 1h
     info: time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin