blob: dff6d2df35ed77733b6590bcd711c8941d006ea1 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
|
# you can disable an alarm notification by setting the 'to' line to: silent
# Alert: Consul Enterprise license expiration.
# Reads the `license_expiration` dimension of consul.license_expiration_time
# once every 60 minutes; the value is expressed in seconds (see `units`).
#   warn - value is below 14 days (14*24*60*60 seconds)
#   crit - value is below 7 days  (7*24*60*60 seconds)
# No `delay` line is set for this alert.
template: consul_license_expiration_time
on: consul.license_expiration_time
class: Errors
type: ServiceMesh
component: Consul
calc: $license_expiration
every: 60m
units: seconds
warn: $this < 14*24*60*60
crit: $this < 7*24*60*60
info: Consul Enterprise licence expiration time on node ${label:node_name} datacenter ${label:datacenter}
to: sysadmin
# Alert: datacenter-wide Autopilot health.
# Evaluates the `unhealthy` dimension of consul.autopilot_health_status every
# 10 seconds and raises WARNING while it equals 1. There is no crit level.
# `delay: down ...` postpones notifications for status decreases (recovery):
# 5m initially, multiplied by 1.5 on repeated changes, capped at 1h.
template: consul_autopilot_health_status
on: consul.autopilot_health_status
class: Errors
type: ServiceMesh
component: Consul
calc: $unhealthy
every: 10s
units: status
warn: $this == 1
delay: down 5m multiplier 1.5 max 1h
info: datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name}
to: sysadmin
# Alert: per-server Autopilot health.
# Evaluates the `unhealthy` dimension of consul.autopilot_server_health_status
# every 10 seconds and raises WARNING while it equals 1. There is no crit level.
# `delay: down ...` postpones notifications for status decreases (recovery):
# 5m initially, multiplied by 1.5 on repeated changes, capped at 1h.
template: consul_autopilot_server_health_status
on: consul.autopilot_server_health_status
class: Errors
type: ServiceMesh
component: Consul
calc: $unhealthy
every: 10s
units: status
warn: $this == 1
delay: down 5m multiplier 1.5 max 1h
info: server ${label:node_name} from datacenter ${label:datacenter} is unhealthy
to: sysadmin
# Alert: Raft leader last-contact time.
# Uses the 1-minute average of the `quantile_0.5` (median) dimension of
# consul.raft_leader_last_contact_time, re-evaluated every 10 seconds, in ms.
# Both levels use hysteresis keyed on the alert's current $status:
#   warn - raised above 200 ms; once WARNING or higher, held until <= 150 ms
#   crit - raised above 500 ms; once CRITICAL, held until <= 200 ms
# `delay: down ...` postpones notifications for status decreases (recovery):
# 5m initially, multiplied by 1.5 on repeated changes, capped at 1h.
template: consul_raft_leader_last_contact_time
on: consul.raft_leader_last_contact_time
class: Errors
type: ServiceMesh
component: Consul
lookup: average -1m unaligned of quantile_0.5
every: 10s
units: milliseconds
warn: $this > (($status >= $WARNING) ? (150) : (200))
crit: $this > (($status == $CRITICAL) ? (200) : (500))
delay: down 5m multiplier 1.5 max 1h
info: median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes
to: sysadmin
# Alert: Raft leadership transitions.
# Sums consul.raft_leadership_transitions_rate over the last minute
# (re-evaluated every 10 seconds) and raises WARNING when the sum is above 0,
# i.e. at least one transition was recorded in the window. No crit level.
# `delay: down ...` postpones notifications for status decreases (recovery):
# 5m initially, multiplied by 1.5 on repeated changes, capped at 1h.
template: consul_raft_leadership_transitions
on: consul.raft_leadership_transitions_rate
class: Errors
type: ServiceMesh
component: Consul
lookup: sum -1m unaligned
every: 10s
units: transitions
warn: $this > 0
delay: down 5m multiplier 1.5 max 1h
info: there has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader
to: sysadmin
# Alert: saturation of the main Raft goroutine.
# Uses the 1-minute average of the `quantile_0.9` dimension of
# consul.raft_thread_main_saturation_perc, re-evaluated every 10 seconds.
#   warn - raised above 50; once WARNING or higher, held until <= 40
# There is no crit level.
# `delay: down ...` postpones notifications for status decreases (recovery):
# 5m initially, multiplied by 1.5 on repeated changes, capped at 1h.
template: consul_raft_thread_main_saturation
on: consul.raft_thread_main_saturation_perc
class: Utilization
type: ServiceMesh
component: Consul
lookup: average -1m unaligned of quantile_0.9
every: 10s
units: percentage
warn: $this > (($status >= $WARNING) ? (40) : (50))
delay: down 5m multiplier 1.5 max 1h
info: average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
to: sysadmin
# Alert: saturation of the FSM Raft goroutine.
# Uses the 1-minute average of the `quantile_0.9` dimension of
# consul.raft_thread_fsm_saturation_perc, re-evaluated every 10 seconds.
#   warn - raised above 50; once WARNING or higher, held until <= 40
# There is no crit level.
# The value is a saturation percentage (the chart is `..._perc` and the
# thresholds are 40/50), so `units` is `percentage`, matching the main Raft
# thread saturation alert.
# `delay: down ...` postpones notifications for status decreases (recovery):
# 5m initially, multiplied by 1.5 on repeated changes, capped at 1h.
template: consul_raft_thread_fsm_saturation
on: consul.raft_thread_fsm_saturation_perc
class: Utilization
type: ServiceMesh
component: Consul
lookup: average -1m unaligned of quantile_0.9
every: 10s
units: percentage
warn: $this > (($status >= $WARNING) ? (40) : (50))
delay: down 5m multiplier 1.5 max 1h
info: average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
to: sysadmin
# Alert: rate-limited client RPC requests.
# Sums consul.client_rpc_requests_exceeded_rate over the last minute,
# re-evaluated every 10 seconds.
#   warn - raised when the sum is above 5; once WARNING or higher, held until
#          the sum is back to 0 (threshold drops to 0 while raised)
# There is no crit level.
# `delay: down ...` postpones notifications for status decreases (recovery):
# 5m initially, multiplied by 1.5 on repeated changes, capped at 1h.
template: consul_client_rpc_requests_exceeded
on: consul.client_rpc_requests_exceeded_rate
class: Errors
type: ServiceMesh
component: Consul
lookup: sum -1m unaligned
every: 10s
units: requests
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: down 5m multiplier 1.5 max 1h
info: number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
to: sysadmin
# Alert: failed client RPC requests.
# Sums consul.client_rpc_requests_failed_rate over the last minute,
# re-evaluated every 10 seconds.
#   warn - raised when the sum is above 5; once WARNING or higher, held until
#          the sum is back to 0 (threshold drops to 0 while raised)
# There is no crit level.
# `delay: down ...` postpones notifications for status decreases (recovery):
# 5m initially, multiplied by 1.5 on repeated changes, capped at 1h.
template: consul_client_rpc_requests_failed
on: consul.client_rpc_requests_failed_rate
class: Errors
type: ServiceMesh
component: Consul
lookup: sum -1m unaligned
every: 10s
units: requests
warn: $this > (($status >= $WARNING) ? (0) : (5))
delay: down 5m multiplier 1.5 max 1h
info: number of failed RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
to: sysadmin
# Alert: node health check status.
# Every 10 seconds adds the `warning` and `critical` dimensions of
# consul.node_health_check_status and raises WARNING when the result is a
# real number (not NaN) other than 0. There is no crit level.
# `delay: down ...` postpones notifications for status decreases (recovery):
# 5m initially, multiplied by 1.5 on repeated changes, capped at 1h.
template: consul_node_health_check_status
on: consul.node_health_check_status
class: Errors
type: ServiceMesh
component: Consul
calc: $warning + $critical
every: 10s
units: status
warn: $this != nan AND $this != 0
delay: down 5m multiplier 1.5 max 1h
info: node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
to: sysadmin
# Alert: service health check status.
# Every 10 seconds adds the `warning` and `critical` dimensions of
# consul.service_health_check_status and raises WARNING when the sum is
# exactly 1. There is no crit level.
# NOTE(review): the node-level check alert uses `!= nan AND != 0` instead of
# `== 1`; the two differ only if both dimensions can be 1 at once - confirm.
# `delay: down ...` postpones notifications for status decreases (recovery):
# 5m initially, multiplied by 1.5 on repeated changes, capped at 1h.
template: consul_service_health_check_status
on: consul.service_health_check_status
class: Errors
type: ServiceMesh
component: Consul
calc: $warning + $critical
every: 10s
units: status
warn: $this == 1
delay: down 5m multiplier 1.5 max 1h
info: service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
to: sysadmin
# Alert: garbage-collection pause time.
# Sums consul.gc_pause_time over the last minute (re-evaluated every 10
# seconds), i.e. seconds of GC pause accumulated in that window.
# Both levels use hysteresis keyed on the alert's current $status:
#   warn - raised above 2 s; once WARNING or higher, held until <= 1 s
#   crit - raised above 5 s; once CRITICAL, held until <= 2 s
# The crit expression must test `$status == $CRITICAL`. Testing
# `$status >= $WARNING` would lower the crit threshold to 2 s as soon as the
# alert is WARNING, escalating every warning to CRITICAL on the next
# evaluation and making the 5 s threshold ineffective.
# `delay: down ...` postpones notifications for status decreases (recovery):
# 5m initially, multiplied by 1.5 on repeated changes, capped at 1h.
template: consul_gc_pause_time
on: consul.gc_pause_time
class: Errors
type: ServiceMesh
component: Consul
lookup: sum -1m unaligned
every: 10s
units: seconds
warn: $this > (($status >= $WARNING) ? (1) : (2))
crit: $this > (($status == $CRITICAL) ? (2) : (5))
delay: down 5m multiplier 1.5 max 1h
info: time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter}
to: sysadmin
|