mysql-test/suite/rpl/t/rpl_parallel_retry.test


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514

--source include/have_innodb.inc
--source include/have_perfschema.inc
--source include/have_debug.inc
--source include/have_debug_sync.inc
--let $rpl_topology=1->2
--source include/rpl_init.inc

--echo *** Test retry of transactions that fail to replicate due to deadlock or similar temporary error. ***

--connection server_1
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
CREATE TABLE t1 (a int PRIMARY KEY, b INT) ENGINE=InnoDB;
INSERT INTO t1 VALUES (1,1);
--save_master_pos

# Use a stored function to inject a debug_sync into the appropriate THD.
# The function does nothing on the master, and on the slave it injects the
# desired debug_sync action(s).
SET sql_log_bin=0;
--delimiter ||
CREATE FUNCTION foo(x INT, d1 VARCHAR(500), d2 VARCHAR(500))
  RETURNS INT DETERMINISTIC
  BEGIN
    RETURN x;
  END
||
--delimiter ;
SET sql_log_bin=1;

--connection server_2
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
--source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=5;
--source include/start_slave.inc
--sync_with_master
SET sql_log_bin=0;
--delimiter ||
CREATE FUNCTION foo(x INT, d1 VARCHAR(500), d2 VARCHAR(500))
  RETURNS INT DETERMINISTIC
  BEGIN
    IF d1 != '' THEN
      SET debug_sync = d1;
    END IF;
    IF d2 != '' THEN
      SET debug_sync = d2;
    END IF;
    RETURN x;
  END
||
--delimiter ;
SET sql_log_bin=1;
--source include/stop_slave.inc

--connection server_1
SET gtid_seq_no = 100;
BEGIN;
INSERT INTO t1 VALUES (2,1);
UPDATE t1 SET b=b+1 WHERE a=1;
INSERT INTO t1 VALUES (3,1);
COMMIT;
SELECT * FROM t1 ORDER BY a;
--save_master_pos

--connection server_2
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_gtid_0_x_100";
let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
--source include/start_slave.inc
--sync_with_master
SET GLOBAL debug_dbug=@old_dbug;
let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
let $ps_value= query_get_value(select last_trans_retry_count from
    performance_schema.replication_applier_status_by_worker where
    last_trans_retry_count > 0, last_trans_retry_count, 1);
let $assert_text= Performance Schema retries should match with actual retries;
let $assert_cond= "$ps_value" = $new_retry - $old_retry;
source include/assert.inc;


SELECT * FROM t1 ORDER BY a;


--echo *** Test that double retry works when the first retry also fails with temp error ***
--source include/stop_slave.inc

--connection server_1
SET gtid_seq_no = 100;
SET @old_server_id = @@server_id;
SET server_id = 10;
BEGIN;
INSERT INTO t1 VALUES (4,1);
UPDATE t1 SET b=b+1 WHERE a=1;
INSERT INTO t1 VALUES (5,1);
INSERT INTO t1 VALUES (6,1);
COMMIT;
SET server_id = @old_server_id;
SELECT * FROM t1 ORDER BY a;
--save_master_pos

--connection server_2
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_gtid_0_x_100,rpl_parallel_simulate_double_temp_err_gtid_0_x_100";
let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
--source include/start_slave.inc
--sync_with_master
SET GLOBAL debug_dbug=@old_dbug;
let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
let $ps_value= query_get_value(select last_trans_retry_count from
    performance_schema.replication_applier_status_by_worker where
    last_trans_retry_count > 0, last_trans_retry_count, 1);
let $assert_text= Performance Schema retries should match with actual retries;
let $assert_cond= "$ps_value" = $new_retry - $old_retry;
source include/assert.inc;

SELECT * FROM t1 ORDER BY a;


--echo *** Test too many retries, eventually causing failure. ***
--source include/stop_slave.inc

--connection server_1
SET gtid_seq_no = 100;
SET @old_server_id = @@server_id;
SET server_id = 11;
BEGIN;
INSERT INTO t1 VALUES (7,1);
UPDATE t1 SET b=b+1 WHERE a=1;
INSERT INTO t1 VALUES (8,1);
INSERT INTO t1 VALUES (9,1);
COMMIT;
SET server_id = @old_server_id;
SELECT * FROM t1 ORDER BY a;
--save_master_pos

--connection server_2
SET sql_log_bin=0;
CALL mtr.add_suppression("Slave worker thread retried transaction 10 time\\(s\\) in vain, giving up");
CALL mtr.add_suppression("Slave: Deadlock found when trying to get lock; try restarting transaction");
CALL mtr.add_suppression("Slave worker thread retried transaction .* in vain, giving up");
SET sql_log_bin=1;

SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_gtid_0_x_100,rpl_parallel_simulate_infinite_temp_err_gtid_0_x_100";
let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
START SLAVE;
--let $slave_sql_errno= 1213
--source include/wait_for_slave_sql_error.inc
SET GLOBAL debug_dbug=@old_dbug;
let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
--disable_query_log
eval SELECT $new_retry - $old_retry AS retries;
--enable_query_log
let $ps_value= query_get_value(select last_trans_retry_count from
    performance_schema.replication_applier_status_by_worker where
    last_trans_retry_count > 0, last_trans_retry_count, 1);
let $assert_text= Performance Schema retries should match with actual retries;
let $assert_cond= "$ps_value" = $new_retry - $old_retry;
source include/assert.inc;

SELECT * FROM t1 ORDER BY a;
STOP SLAVE IO_THREAD;
--source include/start_slave.inc
--sync_with_master
SELECT * FROM t1 ORDER BY a;

--echo *** Test retry of event group that spans multiple relay log files. ***

--connection server_1
CREATE TABLE t2 (a int PRIMARY KEY, b BLOB) ENGINE=InnoDB;
INSERT INTO t2 VALUES (1,"Hulubullu");
--save_master_pos

--connection server_2
--sync_with_master
--source include/stop_slave.inc
SET @old_max= @@GLOBAL.max_relay_log_size;
SET GLOBAL max_relay_log_size=4096;

--connection server_1
--let $big= `SELECT REPEAT("*", 5000)`
SET gtid_seq_no = 100;
SET @old_server_id = @@server_id;
SET server_id = 12;
BEGIN;
--disable_query_log
eval INSERT INTO t2 VALUES (2, CONCAT("Hello ", "$big"));
eval INSERT INTO t2 VALUES (3, CONCAT("Long data:  ", "$big"));
--enable_query_log
INSERT INTO t1 VALUES (10, 4);
COMMIT;
SET server_id = @old_server_id;
SELECT * FROM t1 WHERE a >= 10 ORDER BY a;
SELECT a, LENGTH(b) FROM t2 ORDER BY a;
--save_master_pos

--connection server_2
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_gtid_0_x_100";
let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
--source include/start_slave.inc
--sync_with_master
SET GLOBAL debug_dbug=@old_dbug;
let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
let $ps_value= query_get_value(select last_trans_retry_count from
    performance_schema.replication_applier_status_by_worker where
    last_trans_retry_count > 0, last_trans_retry_count, 1);
let $assert_text= Performance Schema retries should match with actual retries;
let $assert_cond= "$ps_value" = $new_retry - $old_retry;
source include/assert.inc;

SELECT * FROM t1 WHERE a >= 10 ORDER BY a;
SELECT a, LENGTH(b) FROM t2 ORDER BY a;

--connection server_1
INSERT INTO t1 VALUES (11,11);
--disable_query_log
eval INSERT INTO t2 VALUES (4, "$big");
--enable_query_log
--save_master_pos

--connection server_2
--sync_with_master
SELECT * FROM t1 WHERE a >= 10 ORDER BY a;
SELECT a, LENGTH(b) FROM t2 ORDER BY a;
SET GLOBAL max_relay_log_size=@old_max;


--echo *** MDEV-7065: Incorrect relay log position in parallel replication after retry of transaction ***

--connection server_2
--source include/stop_slave.inc

--connection server_1
BEGIN;
INSERT INTO t1 VALUES (100, 0);
INSERT INTO t1 VALUES (101, 0);
INSERT INTO t1 VALUES (102, 0);
INSERT INTO t1 VALUES (103, 0);
COMMIT;
SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
--save_master_pos

--connection server_2
# Inject a DBUG error insert to cause the XID event of the single transaction
# from the master to fail with a deadlock error and be retried.
# The bug was that the retry of the XID would leave the relay log position
# incorrect (off by the size of XID event).
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_xid";
let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
--source include/start_slave.inc
--sync_with_master
SET GLOBAL debug_dbug=@old_dbug;
let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
let $ps_value= query_get_value(select last_trans_retry_count from
    performance_schema.replication_applier_status_by_worker where
    last_trans_retry_count > 0, last_trans_retry_count, 1);
let $assert_text= Performance Schema retries should match with actual retries;
let $assert_cond= "$ps_value" = $new_retry - $old_retry;
source include/assert.inc;

SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
# Stop the SQL thread. When the bug was there to give the incorrect relay log
# position, the restart of the SQL thread would read garbage data from the
# middle of an event and fail with relay log IO error.
--source include/stop_slave_sql.inc

--connection server_1
INSERT INTO t1 VALUES (104, 1);
INSERT INTO t1 VALUES (105, 1);
INSERT INTO t1 VALUES (106, 1);
INSERT INTO t1 VALUES (107, 1);
INSERT INTO t1 VALUES (108, 1);
INSERT INTO t1 VALUES (109, 1);
--save_master_pos

--connection server_2
--source include/start_slave.inc
--sync_with_master
SELECT * FROM t1 WHERE a >= 100 ORDER BY a;


--echo *** MDEV-6917: Parallel replication: "Commit failed due to failure of an earlier commit on which this one depends", but no prior failure seen **

--connection server_1
CREATE TABLE t3 (a INT PRIMARY KEY, b INT, KEY b_idx(b)) ENGINE=InnoDB;
INSERT INTO t3 VALUES (1,NULL), (2,2), (3,NULL), (4,4), (5, NULL), (6, 6);
CREATE TABLE t4 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
# MDEV-515 takes X-lock on the table for the first insert.
# So concurrent insert won't happen on the table
INSERT INTO t4 VALUES(100, 100);

# We need statement binlog format to be able to inject debug_sync statements
# on the slave with calls to foo().
SET @old_format= @@SESSION.binlog_format;
SET binlog_format='statement';
--save_master_pos

--connection server_2
--sync_with_master
--source include/stop_slave.inc
CHANGE MASTER TO master_use_gtid=no;

--connection server_1

# Create a group commit with three transactions T1, T2, T3.
# T2 will block T1 on the slave where we will make it run first, so it will be
# deadlock killed.
# The bug was that in this case, T3 was signalled to fail due to T2 failing,
# even though the retry of T2 was later successful.

--connect (con1,127.0.0.1,root,,test,$SERVER_MYPORT_1,)
SET @old_format= @@SESSION.binlog_format;
SET binlog_format='statement';
BEGIN;
INSERT INTO t4 VALUES (10, foo(1, 'before_execute_sql_command WAIT_FOR t1_start', ''));
UPDATE t3 SET b=NULL WHERE a=6;
SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued1 WAIT_FOR master_cont1';
send COMMIT;
--connection server_1
SET debug_sync='now WAIT_FOR master_queued1';

--connect (con2,127.0.0.1,root,,test,$SERVER_MYPORT_1,)
SET @old_format= @@SESSION.binlog_format;
SET binlog_format='statement';
BEGIN;
INSERT INTO t4 VALUES (20, foo(2, 'group_commit_waiting_for_prior SIGNAL t2_waiting', ''));
DELETE FROM t3 WHERE b <= 3;
SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued2';
send COMMIT;

--connection server_1
SET debug_sync='now WAIT_FOR master_queued2';

--connect (con3,127.0.0.1,root,,test,$SERVER_MYPORT_1,)
SET @old_format= @@SESSION.binlog_format;
SET binlog_format='statement';
BEGIN;
INSERT INTO t4 VALUES (30, foo(3, 'before_execute_sql_command WAIT_FOR t3_start', 'group_commit_waiting_for_prior SIGNAL t3_waiting'));
INSERT INTO t3 VALUES (7,7);
SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued3';
send COMMIT;

--connection server_1
SET debug_sync='now WAIT_FOR master_queued3';
SET debug_sync='now SIGNAL master_cont1';

--connection con1
REAP;
SET binlog_format=@old_format;
--connection con2
REAP;
SET binlog_format=@old_format;
--connection con3
REAP;
SET debug_sync='RESET';
SET binlog_format=@old_format;

--connection server_1
--save_master_pos
SELECT * FROM t3 ORDER BY a;


--connection server_2
let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
SET @old_dbug=@@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,thd_need_ordering_with_force";
--source include/start_slave.inc
# First, wait for T2 to complete up to where it is waiting for T1 to group
# commit for both of them. This will set locks that will block T1, causing
# a deadlock kill and retry of T2. T1 and T3 are still blocked at the start
# of each their SQL statements.
SET debug_sync='now WAIT_FOR t2_waiting';
# Now let T3 move on until the point where it is itself ready to commit.
SET debug_sync='now SIGNAL t3_start';
SET debug_sync='now WAIT_FOR t3_waiting';
# Now T2 and T3 are set up, so we can let T1 proceed.
SET debug_sync='now SIGNAL t1_start';
# Now we can wait for the slave to catch up.
# We should see T2 being deadlock killed and retried.
# The bug was that T2 deadlock kill would cause T3 to fail due to failure
# of an earlier commit. This is wrong as T2 did not fail, it was only
# retried.
--sync_with_master
SET GLOBAL debug_dbug=@old_dbug;
SET debug_sync='RESET';
let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
let $ps_value= query_get_value(select last_trans_retry_count from
    performance_schema.replication_applier_status_by_worker where
    last_trans_retry_count > 0, last_trans_retry_count, 1);
let $assert_text= Performance Schema retries should match with actual retries;
let $assert_cond= "$ps_value" = $new_retry - $old_retry;
source include/assert.inc;
SELECT * FROM t3 ORDER BY a;


--connection server_1
SET binlog_format=@old_format;


# Clean up of the above part.
--connection server_2
--source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
--source include/start_slave.inc

--connection server_1
DROP TABLE t1, t2, t3, t4;
DROP function foo;

--sync_slave_with_master server_2

#
# MDEV-12746 rpl.rpl_parallel_optimistic_nobinlog fails committing out of order at retry
#

--connection server_1
CREATE TABLE t1 (a int PRIMARY KEY, b INT) ENGINE=InnoDB;
# MDEV-515 takes X-lock on the table for the first insert.
# So concurrent insert won't happen on the table
INSERT INTO t1 VALUES(100, 100);

# Replicate create-t1 and prepare to re-start slave in optimistic mode
--sync_slave_with_master server_2
--source include/stop_slave.inc
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
SET @@GLOBAL.slave_parallel_threads=5;
SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
SET @@GLOBAL.slave_parallel_mode='aggressive';
SET @old_lock_wait_timeout=@@GLOBAL.innodb_lock_wait_timeout;
SET @@GLOBAL.innodb_lock_wait_timeout=2;
SET @old_slave_transaction_retries=@@GLOBAL.slave_transaction_retries;
SET @@GLOBAL.slave_transaction_retries=1;

--echo # Spoilers on the slave side causing temporary errors
--connect (spoiler_21,127.0.0.1,root,,test,$SLAVE_MYPORT)
BEGIN;
  INSERT INTO t1 SET a=1,b=2;

--connect (spoiler_22,127.0.0.1,root,,test,$SLAVE_MYPORT)
BEGIN;
  INSERT INTO t1 SET a=2,b=2;

--echo # Master payload
--connection server_1
SET @@SESSION.GTID_SEQ_NO=1000;
INSERT INTO t1 SET a=1,b=1;
SET @@SESSION.GTID_SEQ_NO=1001;
INSERT INTO t1 SET a=2,b=1;

--echo # Start slave whose both appliers is destined to being blocked
--connection server_2
SET @old_dbug= @@GLOBAL.debug_dbug;
SET @@GLOBAL.debug_dbug="+d,rpl_parallel_simulate_wait_at_retry";
--source include/start_slave.inc

--echo # Make sure both workers are waiting at their sync points
--let $wait_condition= SELECT count(*)=2 FROM information_schema.processlist WHERE state LIKE '%debug sync point%';
--source include/wait_condition.inc


--echo # Signal to the 1st to proceed after it has reached termination state
SET @@DEBUG_SYNC='now SIGNAL proceed_by_1000';
--connection spoiler_21
ROLLBACK;

--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE '%debug sync point%';
--source include/wait_condition.inc

--echo # Release the 2nd worker to proceed
--connection spoiler_22
ROLLBACK;
--connection server_2
SET @@DEBUG_SYNC='now SIGNAL proceed_by_1001';

--echo # observe how it all ends
if (`SELECT count(*) = 1 FROM t1 WHERE a = 1`)
{
  --echo "*** Unexpected commit by the first Worker ***"
  SELECT * from t1;
  --die
}

--echo # Wait for the workers to go home and check the result of applying
--let $wait_condition=SELECT count(*) = 0 FROM information_schema.processlist WHERE command = 'Slave_worker'
--source include/wait_condition.inc
if (`SELECT count(*) = 1 FROM t1 WHERE a = 2`)
{
  --echo
  --echo "*** Error: congrats, you hit MDEV-12746 issue. ***"
  --echo
  --die
}
--echo # which is OK

#
# Clean up
#
--connection server_2
--source include/stop_slave.inc
SET @@GLOBAL.slave_parallel_threads=@old_parallel_threads;
SET @@GLOBAL.slave_parallel_mode=@old_parallel_mode;
SET @@GLOBAL.innodb_lock_wait_timeout=@old_lock_wait_timeout;
SET @@GLOBAL.slave_transaction_retries=@old_slave_transaction_retries;
SET @@GLOBAL.debug_dbug=@old_dbug;
SET debug_sync='RESET';
--source include/start_slave.inc

--connection server_1
DROP TABLE t1;

--sync_slave_with_master server_2

--source include/rpl_end.inc