1 files changed, 514 insertions, 0 deletions
diff --git a/mysql-test/suite/rpl/t/rpl_parallel_retry.test b/mysql-test/suite/rpl/t/rpl_parallel_retry.test
new file mode 100644
index 00000000..8b2affed
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_parallel_retry.test
@@ -0,0 +1,514 @@
+--source include/have_innodb.inc
+--source include/have_perfschema.inc
+--source include/have_debug.inc
+--source include/have_debug_sync.inc
+--let $rpl_topology=1->2
+--source include/rpl_init.inc
+
+--echo *** Test retry of transactions that fail to replicate due to deadlock or similar temporary error. ***
+
+--connection server_1
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+CREATE TABLE t1 (a int PRIMARY KEY, b INT) ENGINE=InnoDB;
+INSERT INTO t1 VALUES (1,1);
+--save_master_pos
+
+# Use a stored function to inject a debug_sync into the appropriate THD.
+# The function does nothing on the master, and on the slave it injects the
+# desired debug_sync action(s).
+SET sql_log_bin=0;
+--delimiter ||
+CREATE FUNCTION foo(x INT, d1 VARCHAR(500), d2 VARCHAR(500))
+  RETURNS INT DETERMINISTIC
+  BEGIN
+    RETURN x;
+  END
+||
+--delimiter ;
+SET sql_log_bin=1;
+
+--connection server_2
+SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+--source include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=5;
+--source include/start_slave.inc
+--sync_with_master
+SET sql_log_bin=0;
+--delimiter ||
+CREATE FUNCTION foo(x INT, d1 VARCHAR(500), d2 VARCHAR(500))
+  RETURNS INT DETERMINISTIC
+  BEGIN
+    IF d1 != '' THEN
+      SET debug_sync = d1;
+    END IF;
+    IF d2 != '' THEN
+      SET debug_sync = d2;
+    END IF;
+    RETURN x;
+  END
+||
+--delimiter ;
+SET sql_log_bin=1;
+--source include/stop_slave.inc
+
+--connection server_1
+SET gtid_seq_no = 100;
+BEGIN;
+INSERT INTO t1 VALUES (2,1);
+UPDATE t1 SET b=b+1 WHERE a=1;
+INSERT INTO t1 VALUES (3,1);
+COMMIT;
+SELECT * FROM t1 ORDER BY a;
+--save_master_pos
+
+--connection server_2
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_gtid_0_x_100";
+let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
+--source include/start_slave.inc
+--sync_with_master
+SET GLOBAL debug_dbug=@old_dbug;
+let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
+let $ps_value= query_get_value(select last_trans_retry_count from
+    performance_schema.replication_applier_status_by_worker where
+    last_trans_retry_count > 0, last_trans_retry_count, 1);
+let $assert_text= Performance Schema retries should match with actual retries;
+let $assert_cond= "$ps_value" = $new_retry - $old_retry;
+source include/assert.inc;
+
+
+SELECT * FROM t1 ORDER BY a;
+
+
+--echo *** Test that double retry works when the first retry also fails with temp error ***
+--source include/stop_slave.inc
+
+--connection server_1
+SET gtid_seq_no = 100;
+SET @old_server_id = @@server_id;
+SET server_id = 10;
+BEGIN;
+INSERT INTO t1 VALUES (4,1);
+UPDATE t1 SET b=b+1 WHERE a=1;
+INSERT INTO t1 VALUES (5,1);
+INSERT INTO t1 VALUES (6,1);
+COMMIT;
+SET server_id = @old_server_id;
+SELECT * FROM t1 ORDER BY a;
+--save_master_pos
+
+--connection server_2
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_gtid_0_x_100,rpl_parallel_simulate_double_temp_err_gtid_0_x_100";
+let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
+--source include/start_slave.inc
+--sync_with_master
+SET GLOBAL debug_dbug=@old_dbug;
+let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
+let $ps_value= query_get_value(select last_trans_retry_count from
+    performance_schema.replication_applier_status_by_worker where
+    last_trans_retry_count > 0, last_trans_retry_count, 1);
+let $assert_text= Performance Schema retries should match with actual retries;
+let $assert_cond= "$ps_value" = $new_retry - $old_retry;
+source include/assert.inc;
+
+SELECT * FROM t1 ORDER BY a;
+
+
+--echo *** Test too many retries, eventually causing failure. ***
+--source include/stop_slave.inc
+
+--connection server_1
+SET gtid_seq_no = 100;
+SET @old_server_id = @@server_id;
+SET server_id = 11;
+BEGIN;
+INSERT INTO t1 VALUES (7,1);
+UPDATE t1 SET b=b+1 WHERE a=1;
+INSERT INTO t1 VALUES (8,1);
+INSERT INTO t1 VALUES (9,1);
+COMMIT;
+SET server_id = @old_server_id;
+SELECT * FROM t1 ORDER BY a;
+--save_master_pos
+
+--connection server_2
+SET sql_log_bin=0;
+CALL mtr.add_suppression("Slave worker thread retried transaction 10 time\\(s\\) in vain, giving up");
+CALL mtr.add_suppression("Slave: Deadlock found when trying to get lock; try restarting transaction");
+CALL mtr.add_suppression("Slave worker thread retried transaction .* in vain, giving up");
+SET sql_log_bin=1;
+
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_gtid_0_x_100,rpl_parallel_simulate_infinite_temp_err_gtid_0_x_100";
+let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
+START SLAVE;
+--let $slave_sql_errno= 1213
+--source include/wait_for_slave_sql_error.inc
+SET GLOBAL debug_dbug=@old_dbug;
+let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
+--disable_query_log
+eval SELECT $new_retry - $old_retry AS retries;
+--enable_query_log
+let $ps_value= query_get_value(select last_trans_retry_count from
+    performance_schema.replication_applier_status_by_worker where
+    last_trans_retry_count > 0, last_trans_retry_count, 1);
+let $assert_text= Performance Schema retries should match with actual retries;
+let $assert_cond= "$ps_value" = $new_retry - $old_retry;
+source include/assert.inc;
+
+SELECT * FROM t1 ORDER BY a;
+STOP SLAVE IO_THREAD;
+--source include/start_slave.inc
+--sync_with_master
+SELECT * FROM t1 ORDER BY a;
+
+--echo *** Test retry of event group that spans multiple relay log files. ***
+
+--connection server_1
+CREATE TABLE t2 (a int PRIMARY KEY, b BLOB) ENGINE=InnoDB;
+INSERT INTO t2 VALUES (1,"Hulubullu");
+--save_master_pos
+
+--connection server_2
+--sync_with_master
+--source include/stop_slave.inc
+SET @old_max= @@GLOBAL.max_relay_log_size;
+SET GLOBAL max_relay_log_size=4096;
+
+--connection server_1
+--let $big= `SELECT REPEAT("*", 5000)`
+SET gtid_seq_no = 100;
+SET @old_server_id = @@server_id;
+SET server_id = 12;
+BEGIN;
+--disable_query_log
+eval INSERT INTO t2 VALUES (2, CONCAT("Hello ", "$big"));
+eval INSERT INTO t2 VALUES (3, CONCAT("Long data:  ", "$big"));
+--enable_query_log
+INSERT INTO t1 VALUES (10, 4);
+COMMIT;
+SET server_id = @old_server_id;
+SELECT * FROM t1 WHERE a >= 10 ORDER BY a;
+SELECT a, LENGTH(b) FROM t2 ORDER BY a;
+--save_master_pos
+
+--connection server_2
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_gtid_0_x_100";
+let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
+--source include/start_slave.inc
+--sync_with_master
+SET GLOBAL debug_dbug=@old_dbug;
+let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
+let $ps_value= query_get_value(select last_trans_retry_count from
+    performance_schema.replication_applier_status_by_worker where
+    last_trans_retry_count > 0, last_trans_retry_count, 1);
+let $assert_text= Performance Schema retries should match with actual retries;
+let $assert_cond= "$ps_value" = $new_retry - $old_retry;
+source include/assert.inc;
+
+SELECT * FROM t1 WHERE a >= 10 ORDER BY a;
+SELECT a, LENGTH(b) FROM t2 ORDER BY a;
+
+--connection server_1
+INSERT INTO t1 VALUES (11,11);
+--disable_query_log
+eval INSERT INTO t2 VALUES (4, "$big");
+--enable_query_log
+--save_master_pos
+
+--connection server_2
+--sync_with_master
+SELECT * FROM t1 WHERE a >= 10 ORDER BY a;
+SELECT a, LENGTH(b) FROM t2 ORDER BY a;
+SET GLOBAL max_relay_log_size=@old_max;
+
+
+--echo *** MDEV-7065: Incorrect relay log position in parallel replication after retry of transaction ***
+
+--connection server_2
+--source include/stop_slave.inc
+
+--connection server_1
+BEGIN;
+INSERT INTO t1 VALUES (100, 0);
+INSERT INTO t1 VALUES (101, 0);
+INSERT INTO t1 VALUES (102, 0);
+INSERT INTO t1 VALUES (103, 0);
+COMMIT;
+SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
+--save_master_pos
+
+--connection server_2
+# Inject a DBUG error insert to cause the XID event of the single transaction
+# from the master to fail with a deadlock error and be retried.
+# The bug was that the retry of the XID would leave the relay log position
+# incorrect (off by the size of XID event).
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_xid";
+let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
+--source include/start_slave.inc
+--sync_with_master
+SET GLOBAL debug_dbug=@old_dbug;
+let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
+let $ps_value= query_get_value(select last_trans_retry_count from
+    performance_schema.replication_applier_status_by_worker where
+    last_trans_retry_count > 0, last_trans_retry_count, 1);
+let $assert_text= Performance Schema retries should match with actual retries;
+let $assert_cond= "$ps_value" = $new_retry - $old_retry;
+source include/assert.inc;
+
+SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
+# Stop the SQL thread. When the bug was there to give the incorrect relay log
+# position, the restart of the SQL thread would read garbage data from the
+# middle of an event and fail with relay log IO error.
+--source include/stop_slave_sql.inc
+
+--connection server_1
+INSERT INTO t1 VALUES (104, 1);
+INSERT INTO t1 VALUES (105, 1);
+INSERT INTO t1 VALUES (106, 1);
+INSERT INTO t1 VALUES (107, 1);
+INSERT INTO t1 VALUES (108, 1);
+INSERT INTO t1 VALUES (109, 1);
+--save_master_pos
+
+--connection server_2
+--source include/start_slave.inc
+--sync_with_master
+SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
+
+
+--echo *** MDEV-6917: Parallel replication: "Commit failed due to failure of an earlier commit on which this one depends", but no prior failure seen **
+
+--connection server_1
+CREATE TABLE t3 (a INT PRIMARY KEY, b INT, KEY b_idx(b)) ENGINE=InnoDB;
+INSERT INTO t3 VALUES (1,NULL), (2,2), (3,NULL), (4,4), (5, NULL), (6, 6);
+CREATE TABLE t4 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
+# MDEV-515 takes X-lock on the table for the first insert.
+# So concurrent insert won't happen on the table
+INSERT INTO t4 VALUES(100, 100);
+
+# We need statement binlog format to be able to inject debug_sync statements
+# on the slave with calls to foo().
+SET @old_format= @@SESSION.binlog_format;
+SET binlog_format='statement';
+--save_master_pos
+
+--connection server_2
+--sync_with_master
+--source include/stop_slave.inc
+CHANGE MASTER TO master_use_gtid=no;
+
+--connection server_1
+
+# Create a group commit with three transactions T1, T2, T3.
+# T2 will block T1 on the slave where we will make it run first, so it will be
+# deadlock killed.
+# The bug was that in this case, T3 was signalled to fail due to T2 failing,
+# even though the retry of T2 was later successful.
+
+--connect (con1,127.0.0.1,root,,test,$SERVER_MYPORT_1,)
+SET @old_format= @@SESSION.binlog_format;
+SET binlog_format='statement';
+BEGIN;
+INSERT INTO t4 VALUES (10, foo(1, 'before_execute_sql_command WAIT_FOR t1_start', ''));
+UPDATE t3 SET b=NULL WHERE a=6;
+SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued1 WAIT_FOR master_cont1';
+send COMMIT;
+--connection server_1
+SET debug_sync='now WAIT_FOR master_queued1';
+
+--connect (con2,127.0.0.1,root,,test,$SERVER_MYPORT_1,)
+SET @old_format= @@SESSION.binlog_format;
+SET binlog_format='statement';
+BEGIN;
+INSERT INTO t4 VALUES (20, foo(2, 'group_commit_waiting_for_prior SIGNAL t2_waiting', ''));
+DELETE FROM t3 WHERE b <= 3;
+SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued2';
+send COMMIT;
+
+--connection server_1
+SET debug_sync='now WAIT_FOR master_queued2';
+
+--connect (con3,127.0.0.1,root,,test,$SERVER_MYPORT_1,)
+SET @old_format= @@SESSION.binlog_format;
+SET binlog_format='statement';
+BEGIN;
+INSERT INTO t4 VALUES (30, foo(3, 'before_execute_sql_command WAIT_FOR t3_start', 'group_commit_waiting_for_prior SIGNAL t3_waiting'));
+INSERT INTO t3 VALUES (7,7);
+SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued3';
+send COMMIT;
+
+--connection server_1
+SET debug_sync='now WAIT_FOR master_queued3';
+SET debug_sync='now SIGNAL master_cont1';
+
+--connection con1
+REAP;
+SET binlog_format=@old_format;
+--connection con2
+REAP;
+SET binlog_format=@old_format;
+--connection con3
+REAP;
+SET debug_sync='RESET';
+SET binlog_format=@old_format;
+
+--connection server_1
+--save_master_pos
+SELECT * FROM t3 ORDER BY a;
+
+
+--connection server_2
+let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
+SET @old_dbug=@@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,thd_need_ordering_with_force";
+--source include/start_slave.inc
+# First, wait for T2 to complete up to where it is waiting for T1 to group
+# commit for both of them. This will set locks that will block T1, causing
+# a deadlock kill and retry of T2. T1 and T3 are still blocked at the start
+# of each their SQL statements.
+SET debug_sync='now WAIT_FOR t2_waiting';
+# Now let T3 move on until the point where it is itself ready to commit.
+SET debug_sync='now SIGNAL t3_start';
+SET debug_sync='now WAIT_FOR t3_waiting';
+# Now T2 and T3 are set up, so we can let T1 proceed.
+SET debug_sync='now SIGNAL t1_start';
+# Now we can wait for the slave to catch up.
+# We should see T2 being deadlock killed and retried.
+# The bug was that T2 deadlock kill would cause T3 to fail due to failure
+# of an earlier commit. This is wrong as T2 did not fail, it was only
+# retried.
+--sync_with_master
+SET GLOBAL debug_dbug=@old_dbug;
+SET debug_sync='RESET';
+let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
+let $ps_value= query_get_value(select last_trans_retry_count from
+    performance_schema.replication_applier_status_by_worker where
+    last_trans_retry_count > 0, last_trans_retry_count, 1);
+let $assert_text= Performance Schema retries should match with actual retries;
+let $assert_cond= "$ps_value" = $new_retry - $old_retry;
+source include/assert.inc;
+SELECT * FROM t3 ORDER BY a;
+
+
+--connection server_1
+SET binlog_format=@old_format;
+
+
+# Clean up of the above part.
+--connection server_2
+--source include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=@old_parallel_threads;
+--source include/start_slave.inc
+
+--connection server_1
+DROP TABLE t1, t2, t3, t4;
+DROP function foo;
+
+--sync_slave_with_master server_2
+
+#
+# MDEV-12746 rpl.rpl_parallel_optimistic_nobinlog fails committing out of order at retry
+#
+
+--connection server_1
+CREATE TABLE t1 (a int PRIMARY KEY, b INT) ENGINE=InnoDB;
+# MDEV-515 takes X-lock on the table for the first insert.
+# So concurrent insert won't happen on the table
+INSERT INTO t1 VALUES(100, 100);
+
+# Replicate create-t1 and prepare to re-start slave in optimistic mode
+--sync_slave_with_master server_2
+--source include/stop_slave.inc
+SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+SET @@GLOBAL.slave_parallel_threads=5;
+SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
+SET @@GLOBAL.slave_parallel_mode='aggressive';
+SET @old_lock_wait_timeout=@@GLOBAL.innodb_lock_wait_timeout;
+SET @@GLOBAL.innodb_lock_wait_timeout=2;
+SET @old_slave_transaction_retries=@@GLOBAL.slave_transaction_retries;
+SET @@GLOBAL.slave_transaction_retries=1;
+
+--echo # Spoilers on the slave side causing temporary errors
+--connect (spoiler_21,127.0.0.1,root,,test,$SLAVE_MYPORT)
+BEGIN;
+  INSERT INTO t1 SET a=1,b=2;
+
+--connect (spoiler_22,127.0.0.1,root,,test,$SLAVE_MYPORT)
+BEGIN;
+  INSERT INTO t1 SET a=2,b=2;
+
+--echo # Master payload
+--connection server_1
+SET @@SESSION.GTID_SEQ_NO=1000;
+INSERT INTO t1 SET a=1,b=1;
+SET @@SESSION.GTID_SEQ_NO=1001;
+INSERT INTO t1 SET a=2,b=1;
+
+--echo # Start slave whose both appliers is destined to being blocked
+--connection server_2
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET @@GLOBAL.debug_dbug="+d,rpl_parallel_simulate_wait_at_retry";
+--source include/start_slave.inc
+
+--echo # Make sure both workers are waiting at their sync points
+--let $wait_condition= SELECT count(*)=2 FROM information_schema.processlist WHERE state LIKE '%debug sync point%';
+--source include/wait_condition.inc
+
+
+--echo # Signal to the 1st to proceed after it has reached termination state
+SET @@DEBUG_SYNC='now SIGNAL proceed_by_1000';
+--connection spoiler_21
+ROLLBACK;
+
+--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE '%debug sync point%';
+--source include/wait_condition.inc
+
+--echo # Release the 2nd worker to proceed
+--connection spoiler_22
+ROLLBACK;
+--connection server_2
+SET @@DEBUG_SYNC='now SIGNAL proceed_by_1001';
+
+--echo # observe how it all ends
+if (`SELECT count(*) = 1 FROM t1 WHERE a = 1`)
+{
+  --echo "*** Unexpected commit by the first Worker ***"
+  SELECT * from t1;
+  --die
+}
+
+--echo # Wait for the workers to go home and check the result of applying
+--let $wait_condition=SELECT count(*) = 0 FROM information_schema.processlist WHERE command = 'Slave_worker'
+--source include/wait_condition.inc
+if (`SELECT count(*) = 1 FROM t1 WHERE a = 2`)
+{
+  --echo
+  --echo "*** Error: congrats, you hit MDEV-12746 issue. ***"
+  --echo
+  --die
+}
+--echo # which is OK
+
+#
+# Clean up
+#
+--connection server_2
+--source include/stop_slave.inc
+SET @@GLOBAL.slave_parallel_threads=@old_parallel_threads;
+SET @@GLOBAL.slave_parallel_mode=@old_parallel_mode;
+SET @@GLOBAL.innodb_lock_wait_timeout=@old_lock_wait_timeout;
+SET @@GLOBAL.slave_transaction_retries=@old_slave_transaction_retries;
+SET @@GLOBAL.debug_dbug=@old_dbug;
+SET debug_sync='RESET';
+--source include/start_slave.inc
+
+--connection server_1
+DROP TABLE t1;
+
+--sync_slave_with_master server_2
+
+--source include/rpl_end.inc