summaryrefslogtreecommitdiffstats
path: root/health/health.d/consul.conf
blob: 8b414a26df4241c5fda65702a954840d99773c81 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# you can disable an alarm notification by setting the 'to' line to: silent

 # Consul Enterprise license expiry.
 # Evaluated once an hour against the chart's `license_expiration` dimension,
 # which is expressed in seconds. Thresholds are written as
 # days*hours*minutes*seconds: WARNING below 14 days, CRITICAL below 7 days.
 template: consul_license_expiration_time
       on: consul.license_expiration_time
    class: Errors
     type: ServiceMesh
component: Consul
     calc: $license_expiration
    every: 60m
    units: seconds
     warn: $this < 14*24*60*60
     crit: $this < 7*24*60*60
  summary: Consul license expiration on ${label:node_name}
     info: Consul Enterprise license expiration time on node ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

 # Datacenter-wide autopilot health.
 # Every 10s the `unhealthy` dimension is read; a value of 1 raises WARNING.
 # There is no CRITICAL level for this alert.
 # `delay: down ...` holds back recovery notifications (5m, growing by 1.5x on
 # repeated status changes, capped at 1h) to avoid flapping notifications.
 template: consul_autopilot_health_status
       on: consul.autopilot_health_status
    class: Errors
     type: ServiceMesh
component: Consul
     calc: $unhealthy
    every: 10s
    units: status
     warn: $this == 1
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul datacenter ${label:datacenter} health
     info: Datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name}
       to: sysadmin

 # Per-server autopilot health.
 # Every 10s the `unhealthy` dimension is read; a value of 1 raises WARNING.
 # There is no CRITICAL level for this alert.
 # `delay: down ...` holds back recovery notifications (5m, growing by 1.5x on
 # repeated status changes, capped at 1h) to avoid flapping notifications.
 template: consul_autopilot_server_health_status
       on: consul.autopilot_server_health_status
    class: Errors
     type: ServiceMesh
component: Consul
     calc: $unhealthy
    every: 10s
    units: status
     warn: $this == 1
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul server ${label:node_name} health
     info: Server ${label:node_name} from datacenter ${label:datacenter} is unhealthy
       to: sysadmin

 # Raft leader last-contact time.
 # Uses the 1-minute average of the `quantile_0.5` (median) dimension, in ms.
 # Both levels use hysteresis via $status so the alert does not flap around a
 # single threshold:
 #   WARNING  raises above 200 and, once raised, clears only at or below 150;
 #   CRITICAL raises above 500 and, once raised, clears only at or below 200.
 template: consul_raft_leader_last_contact_time
       on: consul.raft_leader_last_contact_time
    class: Errors
     type: ServiceMesh
component: Consul
   lookup: average -1m unaligned of quantile_0.5
    every: 10s
    units: milliseconds
     warn: $this > (($status >= $WARNING)  ? (150) : (200))
     crit: $this > (($status == $CRITICAL) ? (200) : (500))
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul leader server ${label:node_name} last contact time
     info: Median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes
       to: sysadmin

 # Raft leadership changes.
 # Sums the chart's values over the last minute (all dimensions, since no
 # `of` clause is given) and raises WARNING if the sum is greater than zero.
 # Recovery notifications are delayed (5m, x1.5 per change, max 1h).
 template: consul_raft_leadership_transitions
       on: consul.raft_leadership_transitions_rate
    class: Errors
     type: ServiceMesh
component: Consul
   lookup: sum -1m unaligned
    every: 10s
    units: transitions
     warn: $this > 0
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul server ${label:node_name} leadership transitions
     info: There has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader
       to: sysadmin

 # Saturation of the main Raft goroutine.
 # Uses the 1-minute average of the `quantile_0.9` dimension, as a percentage.
 # WARNING raises above 50 and, once raised, clears only at or below 40
 # (hysteresis via $status). There is no CRITICAL level.
 template: consul_raft_thread_main_saturation
       on: consul.raft_thread_main_saturation_perc
    class: Utilization
     type: ServiceMesh
component: Consul
   lookup: average -1m unaligned of quantile_0.9
    every: 10s
    units: percentage
     warn: $this > (($status >= $WARNING)  ? (40) : (50))
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul server ${label:node_name} main Raft saturation
     info: Average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

 # Saturation of the FSM Raft goroutine.
 # Uses the 1-minute average of the `quantile_0.9` dimension.
 # The chart is `..._saturation_perc` and the thresholds are percentages, so
 # the value is reported as a percentage (not a duration).
 # WARNING raises above 50 and, once raised, clears only at or below 40
 # (hysteresis via $status). There is no CRITICAL level.
 template: consul_raft_thread_fsm_saturation
       on: consul.raft_thread_fsm_saturation_perc
    class: Utilization
     type: ServiceMesh
component: Consul
   lookup: average -1m unaligned of quantile_0.9
    every: 10s
    units: percentage
     warn: $this > (($status >= $WARNING)  ? (40) : (50))
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul server ${label:node_name} FSM Raft saturation
     info: Average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

 # Rate-limited client RPC requests.
 # Sums the chart's values over the last minute.
 # WARNING raises when the sum exceeds 5 and, once raised, stays up until the
 # sum falls back to 0 (hysteresis via $status). There is no CRITICAL level.
 template: consul_client_rpc_requests_exceeded
       on: consul.client_rpc_requests_exceeded_rate
    class: Errors
     type: ServiceMesh
component: Consul
   lookup: sum -1m unaligned
    every: 10s
    units: requests
     warn: $this > (($status >= $WARNING)  ? (0) : (5))
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul server ${label:node_name} RPC requests rate
     info: Number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

 # Failed client RPC requests.
 # Sums the chart's values over the last minute.
 # WARNING raises when the sum exceeds 5 and, once raised, stays up until the
 # sum falls back to 0 (hysteresis via $status). There is no CRITICAL level.
 template: consul_client_rpc_requests_failed
       on: consul.client_rpc_requests_failed_rate
    class: Errors
     type: ServiceMesh
component: Consul
   lookup: sum -1m unaligned
    every: 10s
    units: requests
     warn: $this > (($status >= $WARNING)  ? (0) : (5))
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul server ${label:node_name} failed RPC requests
     info: number of failed RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

 # Node-level health checks.
 # Adds the `warning` and `critical` dimensions; WARNING is raised when the
 # result is a real number (not nan, i.e. data was collected) and non-zero.
 # There is no CRITICAL level for this alert.
 template: consul_node_health_check_status
       on: consul.node_health_check_status
    class: Errors
     type: ServiceMesh
component: Consul
     calc: $warning + $critical
    every: 10s
    units: status
     warn: $this != nan AND $this != 0
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul node health check ${label:check_name} on ${label:node_name}
     info: Node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

 # Service-level health checks.
 # Adds the `warning` and `critical` dimensions; WARNING is raised when the
 # result equals exactly 1. There is no CRITICAL level for this alert.
 # NOTE(review): this only fires if the two dimensions can never both be 1
 # at once (sum would be 2) - presumably they are mutually exclusive status
 # flags; confirm against the collector.
 template: consul_service_health_check_status
       on: consul.service_health_check_status
    class: Errors
     type: ServiceMesh
component: Consul
     calc: $warning + $critical
    every: 10s
    units: status
     warn: $this == 1
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul service health check ${label:check_name} service ${label:service_name} node ${label:node_name}
     info: Service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin

 # Stop-the-world garbage collection pause time.
 # Sums the chart's values over the last minute, in seconds.
 # Both levels use hysteresis via $status:
 #   WARNING  raises above 2 and, once raised, clears only at or below 1;
 #   CRITICAL raises above 5 and, once raised, clears only at or below 2.
 # The CRITICAL hysteresis must key on `$status == $CRITICAL`: keying it on
 # `$status >= $WARNING` would lower the critical threshold to 2 as soon as
 # the alert enters WARNING, escalating every sustained warning to CRITICAL.
 template: consul_gc_pause_time
       on: consul.gc_pause_time
    class: Errors
     type: ServiceMesh
component: Consul
   lookup: sum -1m unaligned
    every: 10s
    units: seconds
     warn: $this > (($status >= $WARNING)  ? (1) : (2))
     crit: $this > (($status == $CRITICAL) ? (2) : (5))
    delay: down 5m multiplier 1.5 max 1h
  summary: Consul server ${label:node_name} garbage collection pauses
     info: Time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter}
       to: sysadmin