summaryrefslogtreecommitdiffstats
path: root/health/health.d/mysql.conf
blob: 91860c4a7748b7b3f6bf9061c82fda890ee9b35d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# make sure mysql is running
#
# Data-collection staleness check attached to the mysql.queries chart.
# $this is the number of seconds between the current time and the chart's
# last collection timestamp ($now - $last_collected_t).
# Both thresholds are multiples of $update_every and are lowered while the
# alarm is already raised, so it does not flap around a single value:
#   warn: raised above  5 * $update_every, kept while above 1 * $update_every
#         (the lower threshold applies whenever status is WARNING or worse)
#   crit: raised above 60 * $update_every, kept while above 1 * $update_every
#         (the lower threshold applies only while status is CRITICAL)

 template: mysql_last_collected_secs
       on: mysql.queries
    class: Database
component: MySQL
     type: Latency
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
    delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: dba


# -----------------------------------------------------------------------------
# slow queries
#
# $this is the sum of the slow_queries dimension of mysql.queries over the
# last 10 seconds, re-evaluated every 10 seconds.
# Thresholds are lowered while the alarm is already raised (hysteresis):
#   warn: raised above 10, kept while above  5 (status WARNING or worse)
#   crit: raised above 20, kept while above 10 (status CRITICAL)

 template: mysql_10s_slow_queries
       on: mysql.queries
    class: Database
component: MySQL
     type: Latency
   lookup: sum -10s of slow_queries
    units: slow queries
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (5)  : (10))
     crit: $this > (($status == $CRITICAL) ? (10) : (20))
    delay: down 5m multiplier 1.5 max 1h
     info: number of slow queries in the last 10 seconds
       to: dba


# -----------------------------------------------------------------------------
# lock waits
#
# mysql_10s_table_locks_immediate has no warn/crit lines, so it never raises
# an alarm by itself. It sums the `immediate` dimension of mysql.table_locks
# over the last 10 seconds (lookup uses the `absolute` option) and its value
# is read as $mysql_10s_table_locks_immediate by the calc of
# mysql_10s_waited_locks_ratio.

 template: mysql_10s_table_locks_immediate
       on: mysql.table_locks
    class: Database
component: MySQL
     type: Utilization
   lookup: sum -10s absolute of immediate
    units: immediate locks
    every: 10s
     info: number of table immediate locks in the last 10 seconds
       to: dba

 template: mysql_10s_table_locks_waited
# Value-only template: there are no warn/crit lines, so it never raises an
# alarm by itself. It sums the `waited` dimension of mysql.table_locks over
# the last 10 seconds (lookup uses the `absolute` option) and its value is
# read as $mysql_10s_table_locks_waited by the calc of
# mysql_10s_waited_locks_ratio.
       on: mysql.table_locks
    class: Database
component: MySQL
     type: Latency
   lookup: sum -10s absolute of waited
    units: waited locks
    every: 10s
     info: number of table waited locks in the last 10 seconds
       to: dba

 template: mysql_10s_waited_locks_ratio
# Percentage of table locks that had to wait during the last 10 seconds:
#   waited * 100 / (waited + immediate)
# using the values of the mysql_10s_table_locks_waited and
# mysql_10s_table_locks_immediate templates. When the total is not greater
# than zero the calc yields 0 instead of dividing by zero.
# Thresholds are lowered while the alarm is already raised (hysteresis):
#   warn: raised above 25%, kept while above 10% (status WARNING or worse)
#   crit: raised above 50%, kept while above 25% (status CRITICAL)
       on: mysql.table_locks
    class: Database
component: MySQL
     type: Latency
     calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0
    units: %
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (10) : (25))
     crit: $this > (($status == $CRITICAL) ? (25) : (50))
    delay: down 30m multiplier 1.5 max 1h
     info: ratio of waited table locks over the last 10 seconds
       to: dba


# -----------------------------------------------------------------------------
# connections
#
# $this is the active connections as a percentage of the connection limit
# ($active * 100 / $limit) on the mysql.connections_active chart.
# Thresholds are lowered while the alarm is already raised (hysteresis):
#   warn: raised above 70%, kept while above 60% (status WARNING or worse)
#   crit: raised above 90%, kept while above 80% (status CRITICAL)
# NOTE(review): unlike mysql_10s_waited_locks_ratio, the calc has no guard
# for $limit being 0 - confirm the collector never reports a zero limit.

 template: mysql_connections
       on: mysql.connections_active
    class: Database
component: MySQL
     type: Utilization
     calc: $active * 100 / $limit
    units: %
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (60) : (70))
     crit: $this > (($status == $CRITICAL) ? (80) : (90))
    delay: down 15m multiplier 1.5 max 1h
     info: client connections utilization
       to: dba


# -----------------------------------------------------------------------------
# replication
#
# $this is 0 when either $sql_running or $io_running of mysql.slave_status
# is <= 0, and 1 otherwise (i.e. 1 only when both are above zero).
# There is no warn line: the alarm goes straight to critical when $this is 0.

 template: mysql_replication
       on: mysql.slave_status
    class: Database
component: MySQL
     type: Errors
     calc: ($sql_running <= 0 OR $io_running <= 0)?0:1
    units: ok/failed
    every: 10s
     crit: $this == 0
    delay: down 5m multiplier 1.5 max 1h
     info: replication status (0: stopped, 1: working)
       to: dba

 template: mysql_replication_lag
# $this is the $seconds dimension of the mysql.slave_behind chart.
# Thresholds are lowered while the alarm is already raised (hysteresis):
#   warn: raised above 10s, kept while above  5s (status WARNING or worse)
#   crit: raised above 30s, kept while above 10s (status CRITICAL)
       on: mysql.slave_behind
    class: Database
component: MySQL
     type: Errors
     calc: $seconds
    units: seconds
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (5)  : (10))
     crit: $this > (($status == $CRITICAL) ? (10) : (30))
    delay: down 15m multiplier 1.5 max 1h
     info: difference between the timestamp of the latest transaction processed by the SQL thread and \
           the timestamp of the same transaction when it was processed on the master
       to: dba


# -----------------------------------------------------------------------------
# galera cluster size
#
# mysql_galera_cluster_size_max_2m has no warn/crit lines, so it never
# raises an alarm by itself. It tracks the maximum of the
# mysql.galera_cluster_size chart over the last 2 minutes (lookup uses the
# `absolute` option) and serves as the reference value
# $mysql_galera_cluster_size_max_2m for the mysql_galera_cluster_size alarm.

 template: mysql_galera_cluster_size_max_2m
       on: mysql.galera_cluster_size
    class: Database
component: MySQL
     type: Utilization
   lookup: max -2m absolute
    units: nodes
    every: 10s
     info: maximum galera cluster size in the last 2 minutes
       to: dba

 template: mysql_galera_cluster_size
# Compares the current node count ($nodes) with the 2-minute maximum kept
# by the mysql_galera_cluster_size_max_2m template:
#   warn: current size is larger than the 2-minute maximum
#   crit: current size is smaller than the 2-minute maximum
# When both values are equal neither expression is true.
       on: mysql.galera_cluster_size
    class: Database
component: MySQL
     type: Utilization
     calc: $nodes
    units: nodes
    every: 10s
     warn: $this > $mysql_galera_cluster_size_max_2m
     crit: $this < $mysql_galera_cluster_size_max_2m
    delay: up 20s down 5m multiplier 1.5 max 1h
     info: current galera cluster size, compared to the maximum size in the last 2 minutes
       to: dba

# galera node state
#
# $this is the $state dimension of mysql.galera_cluster_state; the numeric
# codes are listed in the info line of this template.
#   warn: 2 (Donor/Desynced) or 3 (Joined)
#   crit: 0 (Undefined), 1 (Joining), or any value >= 5 (5 is Inconsistent)
#   4 (Synced) matches neither expression.
# This template defines no `units` line.

 template: mysql_galera_cluster_state
       on: mysql.galera_cluster_state
    class: Database
component: MySQL
     type: Errors
     calc: $state
    every: 10s
     warn: $this == 2 OR $this == 3
     crit: $this == 0 OR $this == 1 OR $this >= 5
    delay: up 30s down 5m multiplier 1.5 max 1h
     info: galera node state \
           (0: Undefined, 1: Joining, 2: Donor/Desynced, 3: Joined, 4: Synced, 5: Inconsistent)
       to: dba


# galera node status
#
# $this is the $wsrep_cluster_status dimension of
# mysql.galera_cluster_status; the numeric codes are listed in the info
# line of this template (0 means primary).
#   crit: $this is anything other than 0 AND the value of the
#         mysql_galera_cluster_state alarm is not nan.
# There is no warn line and no `units` line.
# NOTE(review): the nan test presumably keeps this alarm quiet when the
# galera node state is not being collected - confirm.

 template: mysql_galera_cluster_status
       on: mysql.galera_cluster_status
    class: Database
component: MySQL
     type: Errors
     calc: $wsrep_cluster_status
    every: 10s
     crit: $mysql_galera_cluster_state != nan AND $this != 0
    delay: up 30s down 5m multiplier 1.5 max 1h
     info: galera node cluster component status \
           (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected). \
           Any other value than primary indicates that the node is part of a nonoperational component.
       to: dba