author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 18:07:14 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 18:07:14 +0000
commit     a175314c3e5827eb193872241446f2f8f5c9d33c (patch)
tree       cd3d60ca99ae00829c52a6ca79150a5b6e62528b /sql/rpl_parallel.cc
parent     Initial commit. (diff)
Adding upstream version 1:10.5.12.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'sql/rpl_parallel.cc')
 -rw-r--r--  sql/rpl_parallel.cc | 2983
 1 file changed, 2983 insertions(+), 0 deletions(-)
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc
new file mode 100644
index 00000000..65d5a06a
--- /dev/null
+++ b/sql/rpl_parallel.cc
@@ -0,0 +1,2983 @@
+#include "mariadb.h" +#include "rpl_parallel.h" +#include "slave.h" +#include "rpl_mi.h" +#include "sql_parse.h" +#include "debug_sync.h" +#include "sql_repl.h" +#include "wsrep_mysqld.h" +#ifdef WITH_WSREP +#include "wsrep_trans_observer.h" +#endif + +/* + Code for optional parallel execution of replicated events on the slave. +*/ + + +/* + Maximum number of queued events to accumulate in a local free list, before + moving them to the global free list. There is additionally a limit on how much + to accumulate based on opt_slave_parallel_max_queued. +*/ +#define QEV_BATCH_FREE 200 + + +struct rpl_parallel_thread_pool global_rpl_thread_pool; + +static void signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi, + int err); + +static int +rpt_handle_event(rpl_parallel_thread::queued_event *qev, + struct rpl_parallel_thread *rpt) +{ + int err; + rpl_group_info *rgi= qev->rgi; + Relay_log_info *rli= rgi->rli; + THD *thd= rgi->thd; + Log_event *ev; + + DBUG_ASSERT(qev->typ == rpl_parallel_thread::queued_event::QUEUED_EVENT); + ev= qev->ev; +#ifdef WITH_WSREP + if (wsrep_before_statement(thd)) + { + WSREP_WARN("Parallel slave failed at wsrep_before_statement() hook"); + return(1); + } +#endif /* WITH_WSREP */ + + thd->system_thread_info.rpl_sql_info->rpl_filter = rli->mi->rpl_filter; + ev->thd= thd; + + strcpy(rgi->event_relay_log_name_buf, qev->event_relay_log_name); + rgi->event_relay_log_name= rgi->event_relay_log_name_buf; + rgi->event_relay_log_pos= qev->event_relay_log_pos; + rgi->future_event_relay_log_pos= qev->future_event_relay_log_pos; + strcpy(rgi->future_event_master_log_name, qev->future_event_master_log_name); + if (!(ev->is_artificial_event() || ev->is_relay_log_event() || + (ev->when == 0))) + rgi->last_master_timestamp= ev->when + (time_t)ev->exec_time; + err= apply_event_and_update_pos_for_parallel(ev, thd, rgi); + + rli->executed_entries++; +#ifdef WITH_WSREP + if (wsrep_after_statement(thd)) + { + WSREP_WARN("Parallel slave failed at wsrep_after_statement() hook"); + err= 1; + } +#endif /* WITH_WSREP */ + /* ToDo: error handling. */ + return err; +} + + +static void +handle_queued_pos_update(THD *thd, rpl_parallel_thread::queued_event *qev) +{ + int cmp; + Relay_log_info *rli; + rpl_parallel_entry *e; + + /* + Events that are not part of an event group, such as Format Description, + Stop, GTID List and such, are executed directly in the driver SQL thread, + to keep the relay log state up-to-date. But the associated position update + is done here, in sync with other normal events as they are queued to + worker threads. + */ + if ((thd->variables.option_bits & OPTION_BEGIN) && + opt_using_transactions) + return; + + /* Do not update position if an earlier event group caused an error abort. 
*/ + DBUG_ASSERT(qev->typ == rpl_parallel_thread::queued_event::QUEUED_POS_UPDATE); + rli= qev->rgi->rli; + e= qev->entry_for_queued; + if (e->stop_on_error_sub_id < (uint64)ULONGLONG_MAX || + (e->force_abort && !rli->stop_for_until)) + return; + + mysql_mutex_lock(&rli->data_lock); + cmp= compare_log_name(rli->group_relay_log_name, qev->event_relay_log_name); + if (cmp < 0) + { + rli->group_relay_log_pos= qev->future_event_relay_log_pos; + strmake_buf(rli->group_relay_log_name, qev->event_relay_log_name); + } else if (cmp == 0 && + rli->group_relay_log_pos < qev->future_event_relay_log_pos) + rli->group_relay_log_pos= qev->future_event_relay_log_pos; + + cmp= compare_log_name(rli->group_master_log_name, qev->future_event_master_log_name); + if (cmp < 0) + { + strcpy(rli->group_master_log_name, qev->future_event_master_log_name); + rli->group_master_log_pos= qev->future_event_master_log_pos; + } + else if (cmp == 0 + && rli->group_master_log_pos < qev->future_event_master_log_pos) + rli->group_master_log_pos= qev->future_event_master_log_pos; + mysql_mutex_unlock(&rli->data_lock); + mysql_cond_broadcast(&rli->data_cond); +} + + +/* + Wait for any pending deadlock kills. Since deadlock kills happen + asynchronously, we need to be sure they will be completed before starting a + new transaction. Otherwise the new transaction might suffer a spurious kill. +*/ +static void +wait_for_pending_deadlock_kill(THD *thd, rpl_group_info *rgi) +{ + PSI_stage_info old_stage; + + mysql_mutex_lock(&thd->LOCK_wakeup_ready); + thd->ENTER_COND(&thd->COND_wakeup_ready, &thd->LOCK_wakeup_ready, + &stage_waiting_for_deadlock_kill, &old_stage); + while (rgi->killed_for_retry == rpl_group_info::RETRY_KILL_PENDING) + mysql_cond_wait(&thd->COND_wakeup_ready, &thd->LOCK_wakeup_ready); + thd->EXIT_COND(&old_stage); +} + + +static void +finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id, + rpl_parallel_entry *entry, rpl_group_info *rgi) +{ + THD *thd= rpt->thd; + wait_for_commit *wfc= &rgi->commit_orderer; + int err; + + thd->get_stmt_da()->set_overwrite_status(true); + /* + Remove any left-over registration to wait for a prior commit to + complete. Normally, such wait would already have been removed at + this point by wait_for_prior_commit() called from within COMMIT + processing. However, in case of MyISAM and no binlog, we might not + have any commit processing, and so we need to do the wait here, + before waking up any subsequent commits, to preserve correct + order of event execution. Also, in the error case we might have + skipped waiting and thus need to remove it explicitly. + + It is important in the non-error case to do a wait, not just an + unregister. Because we might be last in a group-commit that is + replicated in parallel, and the following event will then wait + for us to complete and rely on this also ensuring that any other + event in the group has completed. + + And in the error case, correct GCO lifetime relies on the fact that once + the last event group in the GCO has executed wait_for_prior_commit(), + all earlier event groups have also committed; this way no more + mark_start_commit() calls can be made and it is safe to de-allocate + the GCO. 
+ */ + err= wfc->wait_for_prior_commit(thd); + if (unlikely(err) && !rgi->worker_error) + signal_error_to_sql_driver_thread(thd, rgi, err); + thd->wait_for_commit_ptr= NULL; + + mysql_mutex_lock(&entry->LOCK_parallel_entry); + /* + We need to mark that this event group started its commit phase, in case we + missed it before (otherwise we would deadlock the next event group that is + waiting for this). In most cases (normal DML), it will be a no-op. + */ + rgi->mark_start_commit_no_lock(); + + if (entry->last_committed_sub_id < sub_id) + { + /* + Record that this event group has finished (eg. transaction is + committed, if transactional), so other event groups will no longer + attempt to wait for us to commit. Once we have increased + entry->last_committed_sub_id, no other threads will execute + register_wait_for_prior_commit() against us. Thus, by doing one + extra (usually redundant) wakeup_subsequent_commits() we can ensure + that no register_wait_for_prior_commit() can ever happen without a + subsequent wakeup_subsequent_commits() to wake it up. + + We can race here with the next transactions, but that is fine, as + long as we check that we do not decrease last_committed_sub_id. If + this commit is done, then any prior commits will also have been + done and also no longer need waiting for. + */ + entry->last_committed_sub_id= sub_id; + if (entry->need_sub_id_signal) + mysql_cond_broadcast(&entry->COND_parallel_entry); + + /* Now free any GCOs in which all transactions have committed. */ + group_commit_orderer *tmp_gco= rgi->gco; + while (tmp_gco && + (!tmp_gco->next_gco || tmp_gco->last_sub_id > sub_id || + tmp_gco->next_gco->wait_count > entry->count_committing_event_groups)) + { + /* + We must not free a GCO before the wait_count of the following GCO has + been reached and wakeup has been sent. Otherwise we will lose the + wakeup and hang (there were several such bugs in the past). + + The intention is that this is ensured already since we only free when + the last event group in the GCO has committed + (tmp_gco->last_sub_id <= sub_id). However, if we have a bug, we have + extra check on next_gco->wait_count to hopefully avoid hanging; we + have here an assertion in debug builds that this check does not in + fact trigger. + */ + DBUG_ASSERT(!tmp_gco->next_gco || tmp_gco->last_sub_id > sub_id); + tmp_gco= tmp_gco->prev_gco; + } + while (tmp_gco) + { + group_commit_orderer *prev_gco= tmp_gco->prev_gco; + tmp_gco->next_gco->prev_gco= NULL; + rpt->loc_free_gco(tmp_gco); + tmp_gco= prev_gco; + } + } + + /* + If this event group got error, then any following event groups that have + not yet started should just skip their group, preparing for stop of the + SQL driver thread. 
+ */ + if (unlikely(rgi->worker_error) && + entry->stop_on_error_sub_id == (uint64)ULONGLONG_MAX) + entry->stop_on_error_sub_id= sub_id; + mysql_mutex_unlock(&entry->LOCK_parallel_entry); + DBUG_EXECUTE_IF("hold_worker_on_schedule", { + if (entry->stop_on_error_sub_id < (uint64)ULONGLONG_MAX) + { + debug_sync_set_action(thd, STRING_WITH_LEN("now SIGNAL continue_worker")); + } + }); + + DBUG_EXECUTE_IF("rpl_parallel_simulate_wait_at_retry", { + if (rgi->current_gtid.seq_no == 1000) { + DBUG_ASSERT(entry->stop_on_error_sub_id == sub_id); + debug_sync_set_action(thd, + STRING_WITH_LEN("now WAIT_FOR proceed_by_1000")); + } + }); + + if (rgi->killed_for_retry == rpl_group_info::RETRY_KILL_PENDING) + wait_for_pending_deadlock_kill(thd, rgi); + thd->clear_error(); + thd->reset_killed(); + /* + Would do thd->get_stmt_da()->set_overwrite_status(false) here, but + reset_diagnostics_area() already does that. + */ + thd->get_stmt_da()->reset_diagnostics_area(); + wfc->wakeup_subsequent_commits(rgi->worker_error); +} + + +static void +signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi, int err) +{ + rgi->worker_error= err; + /* + In case we get an error during commit, inform following transactions that + we aborted our commit. + */ + rgi->unmark_start_commit(); + rgi->cleanup_context(thd, true); + rgi->rli->abort_slave= true; + rgi->rli->stop_for_until= false; + mysql_mutex_lock(rgi->rli->relay_log.get_log_lock()); + rgi->rli->relay_log.signal_relay_log_update(); + mysql_mutex_unlock(rgi->rli->relay_log.get_log_lock()); +} + + +static void +unlock_or_exit_cond(THD *thd, mysql_mutex_t *lock, bool *did_enter_cond, + PSI_stage_info *old_stage) +{ + if (*did_enter_cond) + { + thd->EXIT_COND(old_stage); + *did_enter_cond= false; + } + else + mysql_mutex_unlock(lock); +} + + +static void +register_wait_for_prior_event_group_commit(rpl_group_info *rgi, + rpl_parallel_entry *entry) +{ + mysql_mutex_assert_owner(&entry->LOCK_parallel_entry); + if (rgi->wait_commit_sub_id > entry->last_committed_sub_id) + { + /* + Register that the commit of this event group must wait for the + commit of the previous event group to complete before it may + complete itself, so that we preserve commit order. + */ + wait_for_commit *waitee= + &rgi->wait_commit_group_info->commit_orderer; + rgi->commit_orderer.register_wait_for_prior_commit(waitee); + } +} + + +/* + Do not start parallel execution of this event group until all prior groups + that are not safe to run in parallel with it have reached the commit phase. 
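+
+  As a rough illustration: if group commit G1 on the master contained
+  transactions T1 and T2, and the following group commit G2 contained T3,
+  then T3's GCO is created with wait_count= 2, and do_gco_wait() below keeps
+  T3 waiting until entry->count_committing_event_groups reaches 2, i.e.
+  until both T1 and T2 have passed mark_start_commit().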
+*/ +static bool +do_gco_wait(rpl_group_info *rgi, group_commit_orderer *gco, + bool *did_enter_cond, PSI_stage_info *old_stage) +{ + THD *thd= rgi->thd; + rpl_parallel_entry *entry= rgi->parallel_entry; + uint64 wait_count; + + mysql_mutex_assert_owner(&entry->LOCK_parallel_entry); + + if (!gco->installed) + { + group_commit_orderer *prev_gco= gco->prev_gco; + if (prev_gco) + { + prev_gco->last_sub_id= gco->prior_sub_id; + prev_gco->next_gco= gco; + } + gco->installed= true; + } + wait_count= gco->wait_count; + if (wait_count > entry->count_committing_event_groups) + { + DEBUG_SYNC(thd, "rpl_parallel_start_waiting_for_prior"); + thd->ENTER_COND(&gco->COND_group_commit_orderer, + &entry->LOCK_parallel_entry, + &stage_waiting_for_prior_transaction_to_start_commit, + old_stage); + *did_enter_cond= true; + thd->set_time_for_next_stage(); + do + { + if (!rgi->worker_error && unlikely(thd->check_killed(1))) + { + DEBUG_SYNC(thd, "rpl_parallel_start_waiting_for_prior_killed"); + thd->clear_error(); + thd->get_stmt_da()->reset_diagnostics_area(); + thd->send_kill_message(); + slave_output_error_info(rgi, thd); + signal_error_to_sql_driver_thread(thd, rgi, 1); + /* + Even though we were killed, we need to continue waiting for the + prior event groups to signal that we can continue. Otherwise we + mess up the accounting for ordering. However, now that we have + marked the error, events will just be skipped rather than + executed, and things will progress quickly towards stop. + */ + } + mysql_cond_wait(&gco->COND_group_commit_orderer, + &entry->LOCK_parallel_entry); + } while (wait_count > entry->count_committing_event_groups); + } + + if (entry->force_abort && wait_count > entry->stop_count) + { + /* + We are stopping (STOP SLAVE), and this event group is beyond the point + where we can safely stop. So return a flag that will cause us to skip, + rather than execute, the following events. + */ + return true; + } + else + return false; +} + + +static bool +do_ftwrl_wait(rpl_group_info *rgi, + bool *did_enter_cond, PSI_stage_info *old_stage) +{ + THD *thd= rgi->thd; + rpl_parallel_entry *entry= rgi->parallel_entry; + uint64 sub_id= rgi->gtid_sub_id; + bool aborted= false; + DBUG_ENTER("do_ftwrl_wait"); + + mysql_mutex_assert_owner(&entry->LOCK_parallel_entry); + + /* + If a FLUSH TABLES WITH READ LOCK (FTWRL) is pending, check if this + transaction is later than transactions that have priority to complete + before FTWRL. If so, wait here so that FTWRL can proceed and complete + first. + + (entry->pause_sub_id is ULONGLONG_MAX if no FTWRL is pending, which makes + this test false as required). + */ + if (unlikely(sub_id > entry->pause_sub_id)) + { + thd->ENTER_COND(&entry->COND_parallel_entry, &entry->LOCK_parallel_entry, + &stage_waiting_for_ftwrl, old_stage); + *did_enter_cond= true; + thd->set_time_for_next_stage(); + do + { + if (entry->force_abort || rgi->worker_error) + { + aborted= true; + break; + } + if (unlikely(thd->check_killed())) + { + slave_output_error_info(rgi, thd); + signal_error_to_sql_driver_thread(thd, rgi, 1); + break; + } + mysql_cond_wait(&entry->COND_parallel_entry, &entry->LOCK_parallel_entry); + } while (sub_id > entry->pause_sub_id); + + /* + We do not call EXIT_COND() here, as this will be done later by our + caller (since we set *did_enter_cond to true). 
+ */ + } + + if (sub_id > entry->largest_started_sub_id) + entry->largest_started_sub_id= sub_id; + + DBUG_RETURN(aborted); +} + + +static int +pool_mark_busy(rpl_parallel_thread_pool *pool, THD *thd) +{ + PSI_stage_info old_stage; + int res= 0; + + /* + Wait here while the queue is busy. This is done to make FLUSH TABLES WITH + READ LOCK work correctly, without incurring extra locking penalties in + normal operation. FLUSH TABLES WITH READ LOCK needs to lock threads in the + thread pool, and for this we need to make sure the pool will not go away + during the operation. The LOCK_rpl_thread_pool is not suitable for + this. It is taken by release_thread() while holding LOCK_rpl_thread; so it + must be released before locking any LOCK_rpl_thread lock, or a deadlock + can occur. + + So we protect the infrequent operations of FLUSH TABLES WITH READ LOCK and + pool size changes with this condition wait. + */ + DBUG_EXECUTE_IF("mark_busy_mdev_22370",my_sleep(1000000);); + mysql_mutex_lock(&pool->LOCK_rpl_thread_pool); + if (thd) + { + thd->ENTER_COND(&pool->COND_rpl_thread_pool, &pool->LOCK_rpl_thread_pool, + &stage_waiting_for_rpl_thread_pool, &old_stage); + thd->set_time_for_next_stage(); + } + while (pool->busy) + { + if (thd && unlikely(thd->check_killed())) + { + res= 1; + break; + } + mysql_cond_wait(&pool->COND_rpl_thread_pool, &pool->LOCK_rpl_thread_pool); + } + if (!res) + pool->busy= true; + if (thd) + thd->EXIT_COND(&old_stage); + else + mysql_mutex_unlock(&pool->LOCK_rpl_thread_pool); + + return res; +} + + +static void +pool_mark_not_busy(rpl_parallel_thread_pool *pool) +{ + mysql_mutex_lock(&pool->LOCK_rpl_thread_pool); + DBUG_ASSERT(pool->busy); + pool->busy= false; + mysql_cond_broadcast(&pool->COND_rpl_thread_pool); + mysql_mutex_unlock(&pool->LOCK_rpl_thread_pool); +} + + +void +rpl_unpause_after_ftwrl(THD *thd) +{ + uint32 i; + rpl_parallel_thread_pool *pool= &global_rpl_thread_pool; + DBUG_ENTER("rpl_unpause_after_ftwrl"); + + DBUG_ASSERT(pool->busy); + + for (i= 0; i < pool->count; ++i) + { + rpl_parallel_entry *e; + rpl_parallel_thread *rpt= pool->threads[i]; + + mysql_mutex_lock(&rpt->LOCK_rpl_thread); + if (!rpt->current_owner) + { + mysql_mutex_unlock(&rpt->LOCK_rpl_thread); + continue; + } + e= rpt->current_entry; + mysql_mutex_lock(&e->LOCK_parallel_entry); + rpt->pause_for_ftwrl = false; + mysql_mutex_unlock(&rpt->LOCK_rpl_thread); + /* + Do not change pause_sub_id if force_abort is set. + force_abort is set in case of STOP SLAVE. + + Reason: If pause_sub_id is not changed and force_abort is set, + any parallel slave thread waiting in do_ftwrl_wait() will + on wakeup return from do_ftwrl_wait() with 1. This will set + skip_event_group to 1 in handle_rpl_parallel_thread() and the + parallel thread will abort at once. + + If pause_sub_id is changed, the code in handle_rpl_parallel_thread() + would continue to execute the transaction in the queue, which would + cause some transactions to be lost. + */ + if (!e->force_abort) + e->pause_sub_id= (uint64)ULONGLONG_MAX; + mysql_cond_broadcast(&e->COND_parallel_entry); + mysql_mutex_unlock(&e->LOCK_parallel_entry); + } + + pool_mark_not_busy(pool); + DBUG_VOID_RETURN; +} + + +/* + Pause the parallel replication worker threads, for the benefit of + FLUSH TABLES WITH READ LOCK. + + Note: in case of error return, rpl_unpause_after_ftwrl() must _not_ be called. 
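+
+  Schematically, the expected usage from the FTWRL code path is:
+
+    if (!rpl_pause_for_ftwrl(thd))
+    {
+      ... take the global read lock ...
+      rpl_unpause_after_ftwrl(thd);
+    }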
+*/ +int +rpl_pause_for_ftwrl(THD *thd) +{ + uint32 i; + rpl_parallel_thread_pool *pool= &global_rpl_thread_pool; + int err; + DBUG_ENTER("rpl_pause_for_ftwrl"); + + /* + While the count_pending_pause_for_ftwrl counter is non-zero, the pool + cannot be shutdown/resized, so threads are guaranteed to not disappear. + + This is required to safely be able to access the individual threads below. + (We cannot lock an individual thread while holding LOCK_rpl_thread_pool, + as this can deadlock against release_thread()). + */ + if ((err= pool_mark_busy(pool, thd))) + DBUG_RETURN(err); + + for (i= 0; i < pool->count; ++i) + { + PSI_stage_info old_stage; + rpl_parallel_entry *e; + rpl_parallel_thread *rpt= pool->threads[i]; + + mysql_mutex_lock(&rpt->LOCK_rpl_thread); + if (!rpt->current_owner) + { + mysql_mutex_unlock(&rpt->LOCK_rpl_thread); + continue; + } + e= rpt->current_entry; + mysql_mutex_lock(&e->LOCK_parallel_entry); + /* + Setting the rpt->pause_for_ftwrl flag makes sure that the thread will not + de-allocate itself until signalled to do so by rpl_unpause_after_ftwrl(). + */ + rpt->pause_for_ftwrl = true; + mysql_mutex_unlock(&rpt->LOCK_rpl_thread); + ++e->need_sub_id_signal; + if (e->pause_sub_id == (uint64)ULONGLONG_MAX) + e->pause_sub_id= e->largest_started_sub_id; + thd->ENTER_COND(&e->COND_parallel_entry, &e->LOCK_parallel_entry, + &stage_waiting_for_ftwrl_threads_to_pause, &old_stage); + thd->set_time_for_next_stage(); + while (e->pause_sub_id < (uint64)ULONGLONG_MAX && + e->last_committed_sub_id < e->pause_sub_id && + !err) + { + if (unlikely(thd->check_killed())) + { + err= 1; + break; + } + mysql_cond_wait(&e->COND_parallel_entry, &e->LOCK_parallel_entry); + } + --e->need_sub_id_signal; + thd->EXIT_COND(&old_stage); + if (err) + break; + } + + if (err) + rpl_unpause_after_ftwrl(thd); + DBUG_RETURN(err); +} + + +#ifndef DBUG_OFF +static int +dbug_simulate_tmp_error(rpl_group_info *rgi, THD *thd) +{ + if (rgi->current_gtid.domain_id == 0 && rgi->current_gtid.seq_no == 100 && + rgi->retry_event_count == 4) + { + thd->clear_error(); + thd->get_stmt_da()->reset_diagnostics_area(); + my_error(ER_LOCK_DEADLOCK, MYF(0)); + return 1; + } + return 0; +} +#endif + + +/* + If we detect a deadlock due to eg. storage engine locks that conflict with + the fixed commit order, then the later transaction will be killed + asynchronously to allow the former to complete its commit. + + In this case, we convert the 'killed' error into a deadlock error, and retry + the later transaction. + + If we are doing optimistic parallel apply of transactions not known to be + safe, we convert any error to a deadlock error, but then at retry we will + wait for prior transactions to commit first, so that the retries can be + done non-speculatively. +*/ +static void +convert_kill_to_deadlock_error(rpl_group_info *rgi) +{ + THD *thd= rgi->thd; + int err_code; + + if (!thd->get_stmt_da()->is_error()) + return; + err_code= thd->get_stmt_da()->sql_errno(); + if ((rgi->speculation == rpl_group_info::SPECULATE_OPTIMISTIC && + err_code != ER_PRIOR_COMMIT_FAILED) || + ((err_code == ER_QUERY_INTERRUPTED || err_code == ER_CONNECTION_KILLED) && + rgi->killed_for_retry)) + { + thd->clear_error(); + my_error(ER_LOCK_DEADLOCK, MYF(0)); + thd->reset_killed(); + } +} + + +/* + Check if an event marks the end of an event group. Returns zero if the + event does not end a group; otherwise returns 1 if the group is committing, + 2 if it is rolling back. 
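+
+  For example, a typical transactional event group (GTID, table map and row
+  events, then XID) ends at the XID_EVENT and is reported as committing (1);
+  a group terminated by an explicit ROLLBACK query event is reported as
+  rolling back (2); an event in the middle of a group yields 0.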
+*/ +static int +is_group_ending(Log_event *ev, Log_event_type event_type) +{ + if (event_type == XID_EVENT || event_type == XA_PREPARE_LOG_EVENT) + return 1; + if (event_type == QUERY_EVENT) // COMMIT/ROLLBACK are never compressed + { + Query_log_event *qev = (Query_log_event *)ev; + if (qev->is_commit() || + !strncmp(qev->query, STRING_WITH_LEN("XA COMMIT")) || + !strncmp(qev->query, STRING_WITH_LEN("XA ROLLBACK"))) + return 1; + if (qev->is_rollback()) + return 2; + } + return 0; +} + + +static int +retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt, + rpl_parallel_thread::queued_event *orig_qev) +{ + IO_CACHE rlog; + LOG_INFO linfo; + File fd= (File)-1; + const char *errmsg; + inuse_relaylog *ir= rgi->relay_log; + uint64 event_count; + uint64 events_to_execute= rgi->retry_event_count; + Relay_log_info *rli= rgi->rli; + int err; + ulonglong cur_offset, old_offset; + char log_name[FN_REFLEN]; + THD *thd= rgi->thd; + rpl_parallel_entry *entry= rgi->parallel_entry; + ulong retries= 0; + Format_description_log_event *description_event= NULL; + +do_retry: + event_count= 0; + err= 0; + errmsg= NULL; + + /* + If we already started committing before getting the deadlock (or other + error) that caused us to need to retry, we have already signalled + subsequent transactions that we have started committing. This is + potentially a problem, as now we will rollback, and if subsequent + transactions would start to execute now, they could see an unexpected + state of the database and get eg. key not found or duplicate key error. + + However, to get a deadlock in the first place, there must have been + another earlier transaction that is waiting for us. Thus that other + transaction has _not_ yet started to commit, and any subsequent + transactions will still be waiting at this point. + + So here, we decrement back the count of transactions that started + committing (if we already incremented it), undoing the effect of an + earlier mark_start_commit(). Then later, when the retry succeeds and we + commit again, we can do a new mark_start_commit() and eventually wake up + subsequent transactions at the proper time. + + We need to do the unmark before the rollback, to be sure that the + transaction we deadlocked with will not signal that it started to commit + until after the unmark. + */ + DBUG_EXECUTE_IF("inject_mdev8302", { my_sleep(20000);}); + rgi->unmark_start_commit(); + DEBUG_SYNC(thd, "rpl_parallel_retry_after_unmark"); + + /* + We might get the deadlock error that causes the retry during commit, while + sitting in wait_for_prior_commit(). If this happens, we will have a + pending error in the wait_for_commit object. So clear this by + unregistering (and later re-registering) the wait. + */ + if(thd->wait_for_commit_ptr) + thd->wait_for_commit_ptr->unregister_wait_for_prior_commit(); + DBUG_EXECUTE_IF("inject_mdev8031", { + /* Simulate that we get deadlock killed at this exact point. 
*/ + rgi->killed_for_retry= rpl_group_info::RETRY_KILL_KILLED; + thd->set_killed(KILL_CONNECTION); + }); + DBUG_EXECUTE_IF("rpl_parallel_simulate_wait_at_retry", { + if (rgi->current_gtid.seq_no == 1001) { + debug_sync_set_action(thd, + STRING_WITH_LEN("rpl_parallel_simulate_wait_at_retry WAIT_FOR proceed_by_1001")); + } + DEBUG_SYNC(thd, "rpl_parallel_simulate_wait_at_retry"); + }); + + rgi->cleanup_context(thd, 1); + wait_for_pending_deadlock_kill(thd, rgi); + thd->reset_killed(); + thd->clear_error(); + rgi->killed_for_retry = rpl_group_info::RETRY_KILL_NONE; + + /* + If we retry due to a deadlock kill that occurred during the commit step, we + might have already updated (but not committed) an update of table + mysql.gtid_slave_pos, and cleared the gtid_pending flag. Now we have + rolled back any such update, so we must set the gtid_pending flag back to + true so that we will do a new update when/if we succeed with the retry. + */ + rgi->gtid_pending= true; + + mysql_mutex_lock(&rli->data_lock); + ++rli->retried_trans; + statistic_increment(slave_retried_transactions, LOCK_status); + mysql_mutex_unlock(&rli->data_lock); + + for (;;) + { + mysql_mutex_lock(&entry->LOCK_parallel_entry); + if (entry->stop_on_error_sub_id == (uint64) ULONGLONG_MAX || +#ifndef DBUG_OFF + (DBUG_EVALUATE_IF("simulate_mdev_12746", 1, 0)) || +#endif + rgi->gtid_sub_id < entry->stop_on_error_sub_id) + { + register_wait_for_prior_event_group_commit(rgi, entry); + } + else + { + /* + A failure of a preceding "parent" transaction may not be + seen by the current one through its own worker_error. + Such induced error gets set by ourselves now. + */ + err= rgi->worker_error= 1; + my_error(ER_PRIOR_COMMIT_FAILED, MYF(0)); + mysql_mutex_unlock(&entry->LOCK_parallel_entry); + goto err; + } + mysql_mutex_unlock(&entry->LOCK_parallel_entry); + + /* + Let us wait for all prior transactions to complete before trying again. + This way, we avoid repeatedly conflicting with and getting deadlock + killed by the same earlier transaction. + */ + if (!(err= thd->wait_for_prior_commit())) + { + rgi->speculation = rpl_group_info::SPECULATE_WAIT; + break; + } + + convert_kill_to_deadlock_error(rgi); + if (!has_temporary_error(thd)) + goto err; + /* + If we get a temporary error such as a deadlock kill, we can safely + ignore it, as we already rolled back. + + But we still want to retry the wait for the prior transaction to + complete its commit. + */ + thd->clear_error(); + thd->reset_killed(); + if(thd->wait_for_commit_ptr) + thd->wait_for_commit_ptr->unregister_wait_for_prior_commit(); + DBUG_EXECUTE_IF("inject_mdev8031", { + /* Inject a small sleep to give prior transaction a chance to commit. */ + my_sleep(100000); + }); + } + + /* + Let us clear any lingering deadlock kill one more time, here after + wait_for_prior_commit() has completed. This should rule out any + possibility of an old deadlock kill lingering on beyond this point. + */ + thd->reset_killed(); + + strmake_buf(log_name, ir->name); + if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0) + { + err= 1; + goto err; + } + cur_offset= rgi->retry_start_offset; + delete description_event; + description_event= + read_relay_log_description_event(&rlog, cur_offset, &errmsg); + if (!description_event) + { + err= 1; + goto err; + } + DBUG_EXECUTE_IF("inject_mdev8031", { + /* Simulate pending KILL caught in read_relay_log_description_event(). 
*/ + if (unlikely(thd->check_killed())) + { + err= 1; + goto err; + } + }); + my_b_seek(&rlog, cur_offset); + + do + { + Log_event_type event_type; + Log_event *ev; + rpl_parallel_thread::queued_event *qev; + + /* The loop is here so we can try again the next relay log file on EOF. */ + for (;;) + { + old_offset= cur_offset; + ev= Log_event::read_log_event(&rlog, description_event, + opt_slave_sql_verify_checksum); + cur_offset= my_b_tell(&rlog); + + if (ev) + break; + if (unlikely(rlog.error < 0)) + { + errmsg= "slave SQL thread aborted because of I/O error"; + err= 1; + goto check_retry; + } + if (unlikely(rlog.error > 0)) + { + sql_print_error("Slave SQL thread: I/O error reading " + "event(errno: %d cur_log->error: %d)", + my_errno, rlog.error); + errmsg= "Aborting slave SQL thread because of partial event read"; + err= 1; + goto err; + } + /* EOF. Move to the next relay log. */ + end_io_cache(&rlog); + mysql_file_close(fd, MYF(MY_WME)); + fd= (File)-1; + + /* Find the next relay log file. */ + if((err= rli->relay_log.find_log_pos(&linfo, log_name, 1)) || + (err= rli->relay_log.find_next_log(&linfo, 1))) + { + char buff[22]; + sql_print_error("next log error: %d offset: %s log: %s", + err, + llstr(linfo.index_file_offset, buff), + log_name); + goto err; + } + strmake_buf(log_name ,linfo.log_file_name); + + DBUG_EXECUTE_IF("inject_retry_event_group_open_binlog_kill", { + if (retries < 2) + { + /* Simulate that we get deadlock killed during open_binlog(). */ + thd->reset_for_next_command(); + rgi->killed_for_retry= rpl_group_info::RETRY_KILL_KILLED; + thd->set_killed(KILL_CONNECTION); + thd->send_kill_message(); + fd= (File)-1; + err= 1; + goto check_retry; + } + }); + if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0) + { + err= 1; + goto check_retry; + } + description_event->reset_crypto(); + /* Loop to try again on the new log file. 
*/ + } + + event_type= ev->get_type_code(); + if (event_type == FORMAT_DESCRIPTION_EVENT) + { + Format_description_log_event *newde= (Format_description_log_event*)ev; + newde->copy_crypto_data(description_event); + delete description_event; + description_event= newde; + continue; + } + else if (event_type == START_ENCRYPTION_EVENT) + { + description_event->start_decryption((Start_encryption_log_event*)ev); + delete ev; + continue; + } + else if (!Log_event::is_group_event(event_type)) + { + delete ev; + continue; + } + ev->thd= thd; + + mysql_mutex_lock(&rpt->LOCK_rpl_thread); + qev= rpt->retry_get_qev(ev, orig_qev, log_name, old_offset, + cur_offset - old_offset); + mysql_mutex_unlock(&rpt->LOCK_rpl_thread); + if (!qev) + { + delete ev; + my_error(ER_OUT_OF_RESOURCES, MYF(0)); + err= 1; + goto err; + } + if (is_group_ending(ev, event_type) == 1) + rgi->mark_start_commit(); + + err= rpt_handle_event(qev, rpt); + ++event_count; + mysql_mutex_lock(&rpt->LOCK_rpl_thread); + rpt->free_qev(qev); + mysql_mutex_unlock(&rpt->LOCK_rpl_thread); + + delete_or_keep_event_post_apply(rgi, event_type, ev); + DBUG_EXECUTE_IF("rpl_parallel_simulate_double_temp_err_gtid_0_x_100", + if (retries == 0) err= dbug_simulate_tmp_error(rgi, thd);); + DBUG_EXECUTE_IF("rpl_parallel_simulate_infinite_temp_err_gtid_0_x_100", + err= dbug_simulate_tmp_error(rgi, thd);); + if (!err) + continue; + +check_retry: + convert_kill_to_deadlock_error(rgi); + if (has_temporary_error(thd)) + { + ++retries; + if (retries < slave_trans_retries) + { + if (fd >= 0) + { + end_io_cache(&rlog); + mysql_file_close(fd, MYF(MY_WME)); + fd= (File)-1; + } + goto do_retry; + } + sql_print_error("Slave worker thread retried transaction %lu time(s) " + "in vain, giving up. Consider raising the value of " + "the slave_transaction_retries variable.", + slave_trans_retries); + } + goto err; + + } while (event_count < events_to_execute); + +err: + + if (description_event) + delete description_event; + if (fd >= 0) + { + end_io_cache(&rlog); + mysql_file_close(fd, MYF(MY_WME)); + } + if (errmsg) + sql_print_error("Error reading relay log event: %s", errmsg); + return err; +} + + +pthread_handler_t +handle_rpl_parallel_thread(void *arg) +{ + THD *thd; + PSI_stage_info old_stage; + struct rpl_parallel_thread::queued_event *events; + bool group_standalone= true; + bool in_event_group= false; + bool skip_event_group= false; + rpl_group_info *group_rgi= NULL; + group_commit_orderer *gco; + uint64 event_gtid_sub_id= 0; + rpl_sql_thread_info sql_info(NULL); + int err; + + struct rpl_parallel_thread *rpt= (struct rpl_parallel_thread *)arg; + + my_thread_init(); + thd = new THD(next_thread_id()); + thd->thread_stack = (char*)&thd; + server_threads.insert(thd); + set_current_thd(thd); + pthread_detach_this_thread(); + thd->store_globals(); + thd->init_for_queries(); + thd->variables.binlog_annotate_row_events= 0; + init_thr_lock(); + thd->system_thread= SYSTEM_THREAD_SLAVE_SQL; + thd->security_ctx->skip_grants(); + thd->variables.max_allowed_packet= slave_max_allowed_packet; + /* Ensure that slave can execute any alter table it gets from master */ + thd->variables.alter_algorithm= (ulong) Alter_info::ALTER_TABLE_ALGORITHM_DEFAULT; + thd->slave_thread= 1; + + set_slave_thread_options(thd); + thd->client_capabilities = CLIENT_LOCAL_FILES; + thd->net.reading_or_writing= 0; + thd_proc_info(thd, "Waiting for work from main SQL threads"); + thd->variables.lock_wait_timeout= LONG_TIMEOUT; + thd->system_thread_info.rpl_sql_info= &sql_info; + /* + We need to use (at 
least) REPEATABLE READ isolation level. Otherwise + speculative parallel apply can run out-of-order and give wrong results + for statement-based replication. + */ + thd->variables.tx_isolation= ISO_REPEATABLE_READ; + + mysql_mutex_lock(&rpt->LOCK_rpl_thread); + rpt->thd= thd; + + while (rpt->delay_start) + mysql_cond_wait(&rpt->COND_rpl_thread, &rpt->LOCK_rpl_thread); + + rpt->running= true; + mysql_cond_signal(&rpt->COND_rpl_thread); + + thd->set_command(COM_SLAVE_WORKER); +#ifdef WITH_WSREP + wsrep_open(thd); + if (wsrep_before_command(thd)) + { + WSREP_WARN("Parallel slave failed at wsrep_before_command() hook"); + rpt->stop = true; + } +#endif /* WITH_WSREP */ + while (!rpt->stop) + { + uint wait_count= 0; + rpl_parallel_thread::queued_event *qev, *next_qev; + + thd->ENTER_COND(&rpt->COND_rpl_thread, &rpt->LOCK_rpl_thread, + &stage_waiting_for_work_from_sql_thread, &old_stage); + /* + There are 4 cases that should cause us to wake up: + - Events have been queued for us to handle. + - We have an owner, but no events and not inside event group -> we need + to release ourself to the thread pool + - SQL thread is stopping, and we have an owner but no events, and we are + inside an event group; no more events will be queued to us, so we need + to abort the group (force_abort==1). + - Thread pool shutdown (rpt->stop==1). + */ + while (!( (events= rpt->event_queue) || + (rpt->current_owner && !in_event_group) || + (rpt->current_owner && group_rgi->parallel_entry->force_abort) || + rpt->stop)) + { + if (!wait_count++) + thd->set_time_for_next_stage(); + mysql_cond_wait(&rpt->COND_rpl_thread, &rpt->LOCK_rpl_thread); + } + rpt->dequeue1(events); + thd->EXIT_COND(&old_stage); + + more_events: + for (qev= events; qev; qev= next_qev) + { + Log_event_type event_type; + rpl_group_info *rgi= qev->rgi; + rpl_parallel_entry *entry= rgi->parallel_entry; + bool end_of_group; + int group_ending; + + next_qev= qev->next; + if (qev->typ == rpl_parallel_thread::queued_event::QUEUED_POS_UPDATE) + { + handle_queued_pos_update(thd, qev); + rpt->loc_free_qev(qev); + continue; + } + else if (qev->typ == + rpl_parallel_thread::queued_event::QUEUED_MASTER_RESTART) + { + if (in_event_group) + { + /* + Master restarted (crashed) in the middle of an event group. + So we need to roll back and discard that event group. + */ + group_rgi->cleanup_context(thd, 1); + in_event_group= false; + finish_event_group(rpt, group_rgi->gtid_sub_id, + qev->entry_for_queued, group_rgi); + + rpt->loc_free_rgi(group_rgi); + thd->rgi_slave= group_rgi= NULL; + } + + rpt->loc_free_qev(qev); + continue; + } + DBUG_ASSERT(qev->typ==rpl_parallel_thread::queued_event::QUEUED_EVENT); + + thd->rgi_slave= rgi; + gco= rgi->gco; + /* Handle a new event group, which will be initiated by a GTID event. */ + if ((event_type= qev->ev->get_type_code()) == GTID_EVENT) + { + bool did_enter_cond= false; + PSI_stage_info old_stage; + + DBUG_EXECUTE_IF("hold_worker_on_schedule", { + if (rgi->current_gtid.domain_id == 0 && + rgi->current_gtid.seq_no == 100) { + debug_sync_set_action(thd, + STRING_WITH_LEN("now SIGNAL reached_pause WAIT_FOR continue_worker")); + } + }); + DBUG_EXECUTE_IF("rpl_parallel_scheduled_gtid_0_x_100", { + if (rgi->current_gtid.domain_id == 0 && + rgi->current_gtid.seq_no == 100) { + debug_sync_set_action(thd, + STRING_WITH_LEN("now SIGNAL scheduled_gtid_0_x_100")); + } + }); + + if(unlikely(thd->wait_for_commit_ptr) && group_rgi != NULL) + { + /* + This indicates that we get a new GTID event in the middle of + a not completed event group. 
This is corrupt binlog (the master + will never write such binlog), so it does not happen unless + someone tries to inject wrong crafted binlog, but let us still + try to handle it somewhat nicely. + */ + group_rgi->cleanup_context(thd, true); + finish_event_group(rpt, group_rgi->gtid_sub_id, + group_rgi->parallel_entry, group_rgi); + rpt->loc_free_rgi(group_rgi); + } + + thd->tx_isolation= (enum_tx_isolation)thd->variables.tx_isolation; + in_event_group= true; + /* + If the standalone flag is set, then this event group consists of a + single statement (possibly preceeded by some Intvar_log_event and + similar), without any terminating COMMIT/ROLLBACK/XID. + */ + group_standalone= + (0 != (static_cast<Gtid_log_event *>(qev->ev)->flags2 & + Gtid_log_event::FL_STANDALONE)); + + event_gtid_sub_id= rgi->gtid_sub_id; + rgi->thd= thd; + + mysql_mutex_lock(&entry->LOCK_parallel_entry); + skip_event_group= do_gco_wait(rgi, gco, &did_enter_cond, &old_stage); + + if (unlikely(entry->stop_on_error_sub_id <= rgi->wait_commit_sub_id)) + { + skip_event_group= true; + rgi->worker_error= 1; + } + if (likely(!skip_event_group)) + skip_event_group= do_ftwrl_wait(rgi, &did_enter_cond, &old_stage); + + /* + Register ourself to wait for the previous commit, if we need to do + such registration _and_ that previous commit has not already + occurred. + */ + register_wait_for_prior_event_group_commit(rgi, entry); + + unlock_or_exit_cond(thd, &entry->LOCK_parallel_entry, + &did_enter_cond, &old_stage); + + thd->wait_for_commit_ptr= &rgi->commit_orderer; + + if (opt_gtid_ignore_duplicates && + rgi->rli->mi->using_gtid != Master_info::USE_GTID_NO) + { + int res= + rpl_global_gtid_slave_state->check_duplicate_gtid(&rgi->current_gtid, + rgi); + if (res < 0) + { + /* Error. */ + slave_output_error_info(rgi, thd); + signal_error_to_sql_driver_thread(thd, rgi, 1); + } + else if (!res) + { + /* GTID already applied by another master connection, skip. */ + skip_event_group= true; + } + else + { + /* We have to apply the event. */ + } + } + /* + If we are optimistically running transactions in parallel, but this + particular event group should not run in parallel with what came + before, then wait now for the prior transaction to complete its + commit. + */ + if (rgi->speculation == rpl_group_info::SPECULATE_WAIT && + (err= thd->wait_for_prior_commit())) + { + slave_output_error_info(rgi, thd); + signal_error_to_sql_driver_thread(thd, rgi, 1); + } + } + + group_rgi= rgi; + group_ending= is_group_ending(qev->ev, event_type); + /* + We do not unmark_start_commit() here in case of an explicit ROLLBACK + statement. Such events should be very rare, there is no real reason + to try to group commit them - on the contrary, it seems best to avoid + running them in parallel with following group commits, as with + ROLLBACK events we are already deep in dangerous corner cases with + mix of transactional and non-transactional tables or the like. And + avoiding the mark_start_commit() here allows us to keep an assertion + in ha_rollback_trans() that we do not rollback after doing + mark_start_commit(). + */ + if (group_ending == 1 && likely(!rgi->worker_error)) + { + /* + Do an extra check for (deadlock) kill here. This helps prevent a + lingering deadlock kill that occurred during normal DML processing to + propagate past the mark_start_commit(). 
If we detect a deadlock only + after mark_start_commit(), we have to unmark, which has at least a + theoretical possibility of leaving a window where it looks like all + transactions in a GCO have started committing, while in fact one + will need to roll back and retry. This is not supposed to be possible + (since there is a deadlock, at least one transaction should be + blocked from reaching commit), but this seems a fragile assurance, + and there were historically a number of subtle bugs in this area. + */ + if (!thd->killed) + { + DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit"); + rgi->mark_start_commit(); + DEBUG_SYNC(thd, "rpl_parallel_after_mark_start_commit"); + } + } + + /* + If the SQL thread is stopping, we just skip execution of all the + following event groups. We still do all the normal waiting and wakeup + processing between the event groups as a simple way to ensure that + everything is stopped and cleaned up correctly. + */ + if (likely(!rgi->worker_error) && !skip_event_group) + { + ++rgi->retry_event_count; +#ifndef DBUG_OFF + err= 0; + DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_xid", + if (event_type == XID_EVENT) + { + thd->clear_error(); + thd->get_stmt_da()->reset_diagnostics_area(); + my_error(ER_LOCK_DEADLOCK, MYF(0)); + err= 1; + DEBUG_SYNC(thd, "rpl_parallel_simulate_temp_err_xid"); + }); + if (!err) +#endif + { + if (unlikely(thd->check_killed())) + { + thd->clear_error(); + thd->get_stmt_da()->reset_diagnostics_area(); + thd->send_kill_message(); + err= 1; + } + else + err= rpt_handle_event(qev, rpt); + } + delete_or_keep_event_post_apply(rgi, event_type, qev->ev); + DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_gtid_0_x_100", + err= dbug_simulate_tmp_error(rgi, thd);); + if (unlikely(err)) + { + convert_kill_to_deadlock_error(rgi); + if (has_temporary_error(thd) && slave_trans_retries > 0) + err= retry_event_group(rgi, rpt, qev); + } + } + else + { + delete qev->ev; + thd->get_stmt_da()->set_overwrite_status(true); + err= thd->wait_for_prior_commit(); + thd->get_stmt_da()->set_overwrite_status(false); + } + + end_of_group= + in_event_group && + ((group_standalone && !Log_event::is_part_of_group(event_type)) || + group_ending); + + rpt->loc_free_qev(qev); + + if (unlikely(err)) + { + if (!rgi->worker_error) + { + slave_output_error_info(rgi, thd); + signal_error_to_sql_driver_thread(thd, rgi, err); + } + thd->reset_killed(); + } + if (end_of_group) + { + in_event_group= false; + finish_event_group(rpt, event_gtid_sub_id, entry, rgi); + rpt->loc_free_rgi(rgi); + thd->rgi_slave= group_rgi= rgi= NULL; + skip_event_group= false; + DEBUG_SYNC(thd, "rpl_parallel_end_of_group"); + } + } + + mysql_mutex_lock(&rpt->LOCK_rpl_thread); + /* + Now that we have the lock, we can move everything from our local free + lists to the real free lists that are also accessible from the SQL + driver thread. + */ + rpt->batch_free(); + + if ((events= rpt->event_queue) != NULL) + { + /* + Take next group of events from the replication pool. + This is faster than having to wake up the pool manager thread to give + us a new event. + */ + rpt->dequeue1(events); + mysql_mutex_unlock(&rpt->LOCK_rpl_thread); + goto more_events; + } + + rpt->inuse_relaylog_refcount_update(); + + if (in_event_group && group_rgi->parallel_entry->force_abort) + { + /* + We are asked to abort, without getting the remaining events in the + current event group. 
+ + We have to roll back the current transaction and update the last + sub_id value so that the SQL thread will know we are done with the + half-processed event group. + */ + mysql_mutex_unlock(&rpt->LOCK_rpl_thread); + signal_error_to_sql_driver_thread(thd, group_rgi, 1); + finish_event_group(rpt, group_rgi->gtid_sub_id, + group_rgi->parallel_entry, group_rgi); + in_event_group= false; + mysql_mutex_lock(&rpt->LOCK_rpl_thread); + rpt->free_rgi(group_rgi); + thd->rgi_slave= group_rgi= NULL; + skip_event_group= false; + } + if (!in_event_group) + { + /* If we are in a FLUSH TABLES WITH READ LOCK, wait for it */ + while (rpt->current_entry && rpt->pause_for_ftwrl) + { + /* + We are currently in the delicate process of pausing parallel + replication while FLUSH TABLES WITH READ LOCK is starting. We must + not de-allocate the thread (setting rpt->current_owner= NULL) until + rpl_unpause_after_ftwrl() has woken us up. + */ + rpl_parallel_entry *e= rpt->current_entry; + /* + Wait for rpl_unpause_after_ftwrl() to wake us up. + Note that rpl_pause_for_ftwrl() may wait for 'e->pause_sub_id' + to change. This should happen eventually in finish_event_group() + */ + mysql_mutex_lock(&e->LOCK_parallel_entry); + mysql_mutex_unlock(&rpt->LOCK_rpl_thread); + if (rpt->pause_for_ftwrl) + mysql_cond_wait(&e->COND_parallel_entry, &e->LOCK_parallel_entry); + mysql_mutex_unlock(&e->LOCK_parallel_entry); + mysql_mutex_lock(&rpt->LOCK_rpl_thread); + } + + rpt->current_owner= NULL; + /* Tell wait_for_done() that we are done, if it is waiting. */ + if (likely(rpt->current_entry) && + unlikely(rpt->current_entry->force_abort)) + mysql_cond_broadcast(&rpt->COND_rpl_thread_stop); + + rpt->current_entry= NULL; + if (!rpt->stop) + rpt->pool->release_thread(rpt); + } + } +#ifdef WITH_WSREP + wsrep_after_command_before_result(thd); + wsrep_after_command_after_result(thd); + wsrep_close(thd); +#endif /* WITH_WSREP */ + + rpt->thd= NULL; + mysql_mutex_unlock(&rpt->LOCK_rpl_thread); + + thd->clear_error(); + thd->catalog= 0; + thd->reset_query(); + thd->reset_db(&null_clex_str); + thd_proc_info(thd, "Slave worker thread exiting"); + thd->temporary_tables= 0; + + THD_CHECK_SENTRY(thd); + server_threads.erase(thd); + delete thd; + + mysql_mutex_lock(&rpt->LOCK_rpl_thread); + rpt->running= false; + mysql_cond_signal(&rpt->COND_rpl_thread); + mysql_mutex_unlock(&rpt->LOCK_rpl_thread); + + my_thread_end(); + + return NULL; +} + + +static void +dealloc_gco(group_commit_orderer *gco) +{ + mysql_cond_destroy(&gco->COND_group_commit_orderer); + my_free(gco); +} + +/** + Change thread count for global parallel worker threads + + @param pool parallel thread pool + @param new_count Number of threads to be in pool. 0 in shutdown + @param force Force thread count to new_count even if slave + threads are running + + By default we don't resize the pool if there are running threads. + However during shutdown we will always do it. + This is needed as any_slave_sql_running() returns 1 during shutdown + as we don't want to access master_info while + Master_info_index::free_connections are running. 
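+
+  For example (as can be seen further below in this file),
+  rpl_parallel_activate_pool() calls this function with
+  new_count= opt_slave_parallel_threads and force= 0, while pool shutdown
+  goes through rpl_parallel_thread_pool::deactivate(), which passes
+  new_count= 0 and force= 1.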
+*/ + +static int +rpl_parallel_change_thread_count(rpl_parallel_thread_pool *pool, + uint32 new_count, bool force) +{ + uint32 i; + rpl_parallel_thread **old_list= NULL; + rpl_parallel_thread **new_list= NULL; + rpl_parallel_thread *new_free_list= NULL; + rpl_parallel_thread *rpt_array= NULL; + int res; + + if ((res= pool_mark_busy(pool, current_thd))) + return res; + + /* Protect against parallel pool resizes */ + if (pool->count == new_count) + { + pool_mark_not_busy(pool); + return 0; + } + + /* + If we are about to delete pool, do an extra check that there are no new + slave threads running since we marked pool busy + */ + if (!new_count && !force) + { + if (any_slave_sql_running(false)) + { + DBUG_PRINT("warning", + ("SQL threads running while trying to reset parallel pool")); + pool_mark_not_busy(pool); + return 0; // Ok to not resize pool + } + } + + /* + Allocate the new list of threads up-front. + That way, if we fail half-way, we only need to free whatever we managed + to allocate, and will not be left with a half-functional thread pool. + */ + if (new_count && + !my_multi_malloc(PSI_INSTRUMENT_ME, MYF(MY_WME|MY_ZEROFILL), + &new_list, new_count*sizeof(*new_list), + &rpt_array, new_count*sizeof(*rpt_array), + NULL)) + { + my_error(ER_OUTOFMEMORY, MYF(0), (int(new_count*sizeof(*new_list) + + new_count*sizeof(*rpt_array)))); + goto err; + } + + for (i= 0; i < new_count; ++i) + { + pthread_t th; + + new_list[i]= &rpt_array[i]; + new_list[i]->delay_start= true; + mysql_mutex_init(key_LOCK_rpl_thread, &new_list[i]->LOCK_rpl_thread, + MY_MUTEX_INIT_SLOW); + mysql_cond_init(key_COND_rpl_thread, &new_list[i]->COND_rpl_thread, NULL); + mysql_cond_init(key_COND_rpl_thread_queue, + &new_list[i]->COND_rpl_thread_queue, NULL); + mysql_cond_init(key_COND_rpl_thread_stop, + &new_list[i]->COND_rpl_thread_stop, NULL); + new_list[i]->pool= pool; + if (mysql_thread_create(key_rpl_parallel_thread, &th, &connection_attrib, + handle_rpl_parallel_thread, new_list[i])) + { + my_error(ER_OUT_OF_RESOURCES, MYF(0)); + goto err; + } + new_list[i]->next= new_free_list; + new_free_list= new_list[i]; + } + + /* + Grab each old thread in turn, and signal it to stop. + + Note that since we require all replication threads to be stopped before + changing the parallel replication worker thread pool, all the threads will + be already idle and will terminate immediately. 
+ */ + for (i= 0; i < pool->count; ++i) + { + rpl_parallel_thread *rpt; + + mysql_mutex_lock(&pool->LOCK_rpl_thread_pool); + while ((rpt= pool->free_list) == NULL) + mysql_cond_wait(&pool->COND_rpl_thread_pool, &pool->LOCK_rpl_thread_pool); + pool->free_list= rpt->next; + mysql_mutex_unlock(&pool->LOCK_rpl_thread_pool); + mysql_mutex_lock(&rpt->LOCK_rpl_thread); + rpt->stop= true; + mysql_cond_signal(&rpt->COND_rpl_thread); + mysql_mutex_unlock(&rpt->LOCK_rpl_thread); + } + + for (i= 0; i < pool->count; ++i) + { + rpl_parallel_thread *rpt= pool->threads[i]; + mysql_mutex_lock(&rpt->LOCK_rpl_thread); + while (rpt->running) + mysql_cond_wait(&rpt->COND_rpl_thread, &rpt->LOCK_rpl_thread); + mysql_mutex_unlock(&rpt->LOCK_rpl_thread); + mysql_mutex_destroy(&rpt->LOCK_rpl_thread); + mysql_cond_destroy(&rpt->COND_rpl_thread); + while (rpt->qev_free_list) + { + rpl_parallel_thread::queued_event *next= rpt->qev_free_list->next; + my_free(rpt->qev_free_list); + rpt->qev_free_list= next; + } + while (rpt->rgi_free_list) + { + rpl_group_info *next= rpt->rgi_free_list->next; + delete rpt->rgi_free_list; + rpt->rgi_free_list= next; + } + while (rpt->gco_free_list) + { + group_commit_orderer *next= rpt->gco_free_list->next_gco; + dealloc_gco(rpt->gco_free_list); + rpt->gco_free_list= next; + } + } + + old_list= pool->threads; + if (new_count < pool->count) + pool->count= new_count; + pool->threads= new_list; + if (new_count > pool->count) + pool->count= new_count; + my_free(old_list); + pool->free_list= new_free_list; + for (i= 0; i < pool->count; ++i) + { + mysql_mutex_lock(&pool->threads[i]->LOCK_rpl_thread); + pool->threads[i]->delay_start= false; + mysql_cond_signal(&pool->threads[i]->COND_rpl_thread); + while (!pool->threads[i]->running) + mysql_cond_wait(&pool->threads[i]->COND_rpl_thread, + &pool->threads[i]->LOCK_rpl_thread); + mysql_mutex_unlock(&pool->threads[i]->LOCK_rpl_thread); + } + + pool_mark_not_busy(pool); + + return 0; + +err: + if (new_list) + { + while (new_free_list) + { + mysql_mutex_lock(&new_free_list->LOCK_rpl_thread); + new_free_list->delay_start= false; + new_free_list->stop= true; + mysql_cond_signal(&new_free_list->COND_rpl_thread); + while (!new_free_list->running) + mysql_cond_wait(&new_free_list->COND_rpl_thread, + &new_free_list->LOCK_rpl_thread); + while (new_free_list->running) + mysql_cond_wait(&new_free_list->COND_rpl_thread, + &new_free_list->LOCK_rpl_thread); + mysql_mutex_unlock(&new_free_list->LOCK_rpl_thread); + new_free_list= new_free_list->next; + } + my_free(new_list); + } + pool_mark_not_busy(pool); + return 1; +} + +/* + Deactivate the parallel replication thread pool, if there are now no more + SQL threads running. +*/ + +int rpl_parallel_resize_pool_if_no_slaves(void) +{ + /* master_info_index is set to NULL on shutdown */ + if (opt_slave_parallel_threads > 0 && !any_slave_sql_running(false)) + return rpl_parallel_inactivate_pool(&global_rpl_thread_pool); + return 0; +} + + +/** + Pool activation is preceded by taking a "lock" via pool_mark_busy(), + which guarantees that the number of running slaves drops to zero atomically + with the number of pool workers. + This resolves a race between the function caller thread and one + that may be attempting to deactivate the pool. 
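+
+  Schematically, the function below does: pool_mark_busy(); then, if the
+  pool is still empty, release the busy mark and spawn
+  opt_slave_parallel_threads workers through
+  rpl_parallel_change_thread_count(); otherwise just release the busy mark
+  and reuse the already running pool.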
+*/ +int +rpl_parallel_activate_pool(rpl_parallel_thread_pool *pool) +{ + int rc= 0; + + if ((rc= pool_mark_busy(pool, current_thd))) + return rc; // killed + + if (!pool->count) + { + pool_mark_not_busy(pool); + rc= rpl_parallel_change_thread_count(pool, opt_slave_parallel_threads, + 0); + } + else + { + pool_mark_not_busy(pool); + } + return rc; +} + + +int +rpl_parallel_inactivate_pool(rpl_parallel_thread_pool *pool) +{ + return rpl_parallel_change_thread_count(pool, 0, 0); +} + + +void +rpl_parallel_thread::batch_free() +{ + mysql_mutex_assert_owner(&LOCK_rpl_thread); + if (loc_qev_list) + { + *loc_qev_last_ptr_ptr= qev_free_list; + qev_free_list= loc_qev_list; + loc_qev_list= NULL; + dequeue2(loc_qev_size); + /* Signal that our queue can now accept more events. */ + mysql_cond_signal(&COND_rpl_thread_queue); + loc_qev_size= 0; + qev_free_pending= 0; + } + if (loc_rgi_list) + { + *loc_rgi_last_ptr_ptr= rgi_free_list; + rgi_free_list= loc_rgi_list; + loc_rgi_list= NULL; + } + if (loc_gco_list) + { + *loc_gco_last_ptr_ptr= gco_free_list; + gco_free_list= loc_gco_list; + loc_gco_list= NULL; + } +} + + +void +rpl_parallel_thread::inuse_relaylog_refcount_update() +{ + inuse_relaylog *ir= accumulated_ir_last; + if (ir) + { + ir->dequeued_count+= accumulated_ir_count; + accumulated_ir_count= 0; + accumulated_ir_last= NULL; + } +} + + +rpl_parallel_thread::queued_event * +rpl_parallel_thread::get_qev_common(Log_event *ev, ulonglong event_size) +{ + queued_event *qev; + mysql_mutex_assert_owner(&LOCK_rpl_thread); + if ((qev= qev_free_list)) + qev_free_list= qev->next; + else if(!(qev= (queued_event *)my_malloc(PSI_INSTRUMENT_ME, sizeof(*qev), MYF(0)))) + { + my_error(ER_OUTOFMEMORY, MYF(0), (int)sizeof(*qev)); + return NULL; + } + qev->typ= rpl_parallel_thread::queued_event::QUEUED_EVENT; + qev->ev= ev; + qev->event_size= (size_t)event_size; + qev->next= NULL; + return qev; +} + + +rpl_parallel_thread::queued_event * +rpl_parallel_thread::get_qev(Log_event *ev, ulonglong event_size, + Relay_log_info *rli) +{ + queued_event *qev= get_qev_common(ev, event_size); + if (!qev) + return NULL; + strcpy(qev->event_relay_log_name, rli->event_relay_log_name); + qev->event_relay_log_pos= rli->event_relay_log_pos; + qev->future_event_relay_log_pos= rli->future_event_relay_log_pos; + strcpy(qev->future_event_master_log_name, rli->future_event_master_log_name); + return qev; +} + + +rpl_parallel_thread::queued_event * +rpl_parallel_thread::retry_get_qev(Log_event *ev, queued_event *orig_qev, + const char *relay_log_name, + ulonglong event_pos, ulonglong event_size) +{ + queued_event *qev= get_qev_common(ev, event_size); + if (!qev) + return NULL; + qev->rgi= orig_qev->rgi; + strcpy(qev->event_relay_log_name, relay_log_name); + qev->event_relay_log_pos= event_pos; + qev->future_event_relay_log_pos= event_pos+event_size; + strcpy(qev->future_event_master_log_name, + orig_qev->future_event_master_log_name); + return qev; +} + + +void +rpl_parallel_thread::loc_free_qev(rpl_parallel_thread::queued_event *qev) +{ + inuse_relaylog *ir= qev->ir; + inuse_relaylog *last_ir= accumulated_ir_last; + if (ir != last_ir) + { + if (last_ir) + inuse_relaylog_refcount_update(); + accumulated_ir_last= ir; + } + ++accumulated_ir_count; + if (!loc_qev_list) + loc_qev_last_ptr_ptr= &qev->next; + else + qev->next= loc_qev_list; + loc_qev_list= qev; + loc_qev_size+= qev->event_size; + /* + We want to release to the global free list only occasionally, to avoid + having to take the LOCK_rpl_thread mutex too many times. 
+ + However, we do need to release regularly. If we let the unreleased part + grow too large, then the SQL driver thread may go to sleep waiting for + the queue to drop below opt_slave_parallel_max_queued, and this in turn + can stall all other worker threads for more stuff to do. + */ + if (++qev_free_pending >= QEV_BATCH_FREE || + loc_qev_size >= opt_slave_parallel_max_queued/3) + { + mysql_mutex_lock(&LOCK_rpl_thread); + batch_free(); + mysql_mutex_unlock(&LOCK_rpl_thread); + } +} + + +void +rpl_parallel_thread::free_qev(rpl_parallel_thread::queued_event *qev) +{ + mysql_mutex_assert_owner(&LOCK_rpl_thread); + qev->next= qev_free_list; + qev_free_list= qev; +} + + +rpl_group_info* +rpl_parallel_thread::get_rgi(Relay_log_info *rli, Gtid_log_event *gtid_ev, + rpl_parallel_entry *e, ulonglong event_size) +{ + rpl_group_info *rgi; + mysql_mutex_assert_owner(&LOCK_rpl_thread); + if ((rgi= rgi_free_list)) + { + rgi_free_list= rgi->next; + rgi->reinit(rli); + } + else + { + if(!(rgi= new rpl_group_info(rli))) + { + my_error(ER_OUTOFMEMORY, MYF(0), (int)sizeof(*rgi)); + return NULL; + } + rgi->is_parallel_exec = true; + } + if ((rgi->deferred_events_collecting= rli->mi->rpl_filter->is_on()) && + !rgi->deferred_events) + rgi->deferred_events= new Deferred_log_events(rli); + if (event_group_new_gtid(rgi, gtid_ev)) + { + free_rgi(rgi); + my_error(ER_OUT_OF_RESOURCES, MYF(MY_WME)); + return NULL; + } + rgi->parallel_entry= e; + rgi->relay_log= rli->last_inuse_relaylog; + rgi->retry_start_offset= rli->future_event_relay_log_pos-event_size; + rgi->retry_event_count= 0; + rgi->killed_for_retry= rpl_group_info::RETRY_KILL_NONE; + + return rgi; +} + + +void +rpl_parallel_thread::loc_free_rgi(rpl_group_info *rgi) +{ + DBUG_ASSERT(rgi->commit_orderer.waitee == NULL); + rgi->free_annotate_event(); + if (!loc_rgi_list) + loc_rgi_last_ptr_ptr= &rgi->next; + else + rgi->next= loc_rgi_list; + loc_rgi_list= rgi; +} + + +void +rpl_parallel_thread::free_rgi(rpl_group_info *rgi) +{ + mysql_mutex_assert_owner(&LOCK_rpl_thread); + DBUG_ASSERT(rgi->commit_orderer.waitee == NULL); + rgi->free_annotate_event(); + rgi->next= rgi_free_list; + rgi_free_list= rgi; +} + + +group_commit_orderer * +rpl_parallel_thread::get_gco(uint64 wait_count, group_commit_orderer *prev, + uint64 prior_sub_id) +{ + group_commit_orderer *gco; + mysql_mutex_assert_owner(&LOCK_rpl_thread); + if ((gco= gco_free_list)) + gco_free_list= gco->next_gco; + else if(!(gco= (group_commit_orderer *)my_malloc(PSI_INSTRUMENT_ME, sizeof(*gco), MYF(0)))) + { + my_error(ER_OUTOFMEMORY, MYF(0), (int)sizeof(*gco)); + return NULL; + } + mysql_cond_init(key_COND_group_commit_orderer, + &gco->COND_group_commit_orderer, NULL); + gco->wait_count= wait_count; + gco->prev_gco= prev; + gco->next_gco= NULL; + gco->prior_sub_id= prior_sub_id; + gco->installed= false; + gco->flags= 0; + return gco; +} + + +void +rpl_parallel_thread::loc_free_gco(group_commit_orderer *gco) +{ + if (!loc_gco_list) + loc_gco_last_ptr_ptr= &gco->next_gco; + else + gco->next_gco= loc_gco_list; + loc_gco_list= gco; +} + + +rpl_parallel_thread_pool::rpl_parallel_thread_pool() + : threads(0), free_list(0), count(0), inited(false), busy(false) +{ +} + + +int +rpl_parallel_thread_pool::init(uint32 size) +{ + threads= NULL; + free_list= NULL; + count= 0; + busy= false; + + mysql_mutex_init(key_LOCK_rpl_thread_pool, &LOCK_rpl_thread_pool, + MY_MUTEX_INIT_SLOW); + mysql_cond_init(key_COND_rpl_thread_pool, &COND_rpl_thread_pool, NULL); + inited= true; + + /* + The pool is initially empty. 
Threads will be spawned when a slave SQL
+    thread is started.
+  */
+
+  return 0;
+}
+
+
+void
+rpl_parallel_thread_pool::destroy()
+{
+  deactivate();
+  destroy_cond_mutex();
+}
+
+void
+rpl_parallel_thread_pool::deactivate()
+{
+  if (!inited)
+    return;
+  rpl_parallel_change_thread_count(this, 0, 1);
+}
+
+void
+rpl_parallel_thread_pool::destroy_cond_mutex()
+{
+  if (!inited)
+    return;
+  mysql_mutex_destroy(&LOCK_rpl_thread_pool);
+  mysql_cond_destroy(&COND_rpl_thread_pool);
+  inited= false;
+}
+
+
+/*
+  Wait for a worker thread to become idle. When one does, grab the thread for
+  our use and return it.
+
+  Note that we return with the worker thread's LOCK_rpl_thread mutex locked.
+*/
+struct rpl_parallel_thread *
+rpl_parallel_thread_pool::get_thread(rpl_parallel_thread **owner,
+                                     rpl_parallel_entry *entry)
+{
+  rpl_parallel_thread *rpt;
+
+  DBUG_ASSERT(count > 0);
+  mysql_mutex_lock(&LOCK_rpl_thread_pool);
+  while (unlikely(busy) || !(rpt= free_list))
+    mysql_cond_wait(&COND_rpl_thread_pool, &LOCK_rpl_thread_pool);
+  free_list= rpt->next;
+  mysql_mutex_unlock(&LOCK_rpl_thread_pool);
+  mysql_mutex_lock(&rpt->LOCK_rpl_thread);
+  rpt->current_owner= owner;
+  rpt->current_entry= entry;
+
+  return rpt;
+}
+
+
+/*
+  Release a thread to the thread pool.
+  The thread should be locked, and should not have any work queued for it.
+*/
+void
+rpl_parallel_thread_pool::release_thread(rpl_parallel_thread *rpt)
+{
+  rpl_parallel_thread *list;
+
+  mysql_mutex_assert_owner(&rpt->LOCK_rpl_thread);
+  DBUG_ASSERT(rpt->current_owner == NULL);
+  mysql_mutex_lock(&LOCK_rpl_thread_pool);
+  list= free_list;
+  rpt->next= list;
+  free_list= rpt;
+  if (!list)
+    mysql_cond_broadcast(&COND_rpl_thread_pool);
+  mysql_mutex_unlock(&LOCK_rpl_thread_pool);
+}
+
+
+/*
+  Obtain a worker thread that we can queue an event to.
+
+  Each invocation allocates a new worker thread, to maximise
+  parallelism. However, only up to a maximum of
+  --slave-domain-parallel-threads workers can be occupied by a single
+  replication domain; after that point, we start re-using worker threads that
+  are still executing events that were queued earlier for this thread.
+
+  We never queue more than --slave-parallel-max-queued bytes of events
+  for one worker, to avoid the SQL driver thread using up all memory with
+  queued events while worker threads are stalling.
+
+  Note that this function returns with rpl_parallel_thread::LOCK_rpl_thread
+  locked. The exception is if we were killed, in which case NULL is
+  returned.
+
+  The *did_enter_cond flag is set true if we had to wait for a worker thread
+  to become free (with mysql_cond_wait()). If so, old_stage will also be set,
+  and the LOCK_rpl_thread must be released with THD::EXIT_COND() instead
+  of mysql_mutex_unlock.
+
+  When `gtid_ev' is not NULL the last worker thread will be returned again,
+  if it is still available. Otherwise a new worker thread is allocated.
+
+  A worker for an XA transaction is determined through XID hashing, which
+  ensures that an XA-complete event group is scheduled to the same worker
+  that ran the XA-prepare with the same XID.
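+
+  As an illustrative sketch (this mirrors the hashing done below; it is not
+  additional logic): XA PREPARE and its matching XA COMMIT/ROLLBACK carry
+  the same XID, so
+
+      idx= my_hash_sort(&my_charset_bin, xid.key(), xid.key_length())
+           % rpl_thread_max;
+
+  computes the same worker slot for both halves of the transaction.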
+*/
+rpl_parallel_thread *
+rpl_parallel_entry::choose_thread(rpl_group_info *rgi, bool *did_enter_cond,
+                                  PSI_stage_info *old_stage,
+                                  Gtid_log_event *gtid_ev)
+{
+  uint32 idx;
+  Relay_log_info *rli= rgi->rli;
+  rpl_parallel_thread *thr;
+
+  idx= rpl_thread_idx;
+  if (gtid_ev)
+  {
+    if (gtid_ev->flags2 &
+        (Gtid_log_event::FL_COMPLETED_XA | Gtid_log_event::FL_PREPARED_XA))
+      idx= my_hash_sort(&my_charset_bin, gtid_ev->xid.key(),
+                        gtid_ev->xid.key_length()) % rpl_thread_max;
+    else
+    {
+      ++idx;
+      if (idx >= rpl_thread_max)
+        idx= 0;
+    }
+    rpl_thread_idx= idx;
+  }
+  thr= rpl_threads[idx];
+  if (thr)
+  {
+    *did_enter_cond= false;
+    mysql_mutex_lock(&thr->LOCK_rpl_thread);
+    for (;;)
+    {
+      if (thr->current_owner != &rpl_threads[idx])
+      {
+        /*
+          The worker thread became idle, and returned to the free list and
+          possibly was allocated to a different request. So we should
+          allocate a new worker thread.
+        */
+        unlock_or_exit_cond(rli->sql_driver_thd, &thr->LOCK_rpl_thread,
+                            did_enter_cond, old_stage);
+        thr= NULL;
+        break;
+      }
+      else if (thr->queued_size <= opt_slave_parallel_max_queued)
+      {
+        /* The thread is ready to queue into. */
+        break;
+      }
+      else if (unlikely(rli->sql_driver_thd->check_killed(1)))
+      {
+        unlock_or_exit_cond(rli->sql_driver_thd, &thr->LOCK_rpl_thread,
+                            did_enter_cond, old_stage);
+        my_error(ER_CONNECTION_KILLED, MYF(0));
+        DBUG_EXECUTE_IF("rpl_parallel_wait_queue_max",
+          {
+            debug_sync_set_action(rli->sql_driver_thd,
+              STRING_WITH_LEN("now SIGNAL wait_queue_killed"));
+          };);
+        slave_output_error_info(rgi, rli->sql_driver_thd);
+        return NULL;
+      }
+      else
+      {
+        /*
+          We have reached the limit of how much memory we are allowed to use
+          for queuing events, so wait for the thread to consume some of its
+          queue.
+        */
+        if (!*did_enter_cond)
+        {
+          /*
+            We need to do the debug_sync before ENTER_COND(), because
+            debug_sync changes thd->mysys_var->current_mutex, and this can
+            cause THD::awake to use the wrong mutex.
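+
+            (ENTER_COND() records the condition/mutex pair in thd->mysys_var,
+            which is what THD::awake() uses to wake the thread; if debug_sync
+            ran afterwards and installed its own mutex there, a concurrent
+            kill could signal the wrong mutex.)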
+          */
+          DBUG_EXECUTE_IF("rpl_parallel_wait_queue_max",
+            {
+              debug_sync_set_action(rli->sql_driver_thd,
+                STRING_WITH_LEN("now SIGNAL wait_queue_ready"));
+            };);
+          rli->sql_driver_thd->ENTER_COND(&thr->COND_rpl_thread_queue,
+                                          &thr->LOCK_rpl_thread,
+                                          &stage_waiting_for_room_in_worker_thread,
+                                          old_stage);
+          *did_enter_cond= true;
+        }
+        mysql_cond_wait(&thr->COND_rpl_thread_queue, &thr->LOCK_rpl_thread);
+      }
+    }
+  }
+  if (!thr)
+    rpl_threads[idx]= thr= global_rpl_thread_pool.get_thread(&rpl_threads[idx],
+                                                             this);
+
+  return thr;
+}
+
+static void
+free_rpl_parallel_entry(void *element)
+{
+  rpl_parallel_entry *e= (rpl_parallel_entry *)element;
+  while (e->current_gco)
+  {
+    group_commit_orderer *prev_gco= e->current_gco->prev_gco;
+    dealloc_gco(e->current_gco);
+    e->current_gco= prev_gco;
+  }
+  mysql_cond_destroy(&e->COND_parallel_entry);
+  mysql_mutex_destroy(&e->LOCK_parallel_entry);
+  my_free(e);
+}
+
+
+rpl_parallel::rpl_parallel() :
+  current(NULL), sql_thread_stopping(false)
+{
+  my_hash_init(PSI_INSTRUMENT_ME, &domain_hash, &my_charset_bin, 32,
+               offsetof(rpl_parallel_entry, domain_id), sizeof(uint32),
+               NULL, free_rpl_parallel_entry, HASH_UNIQUE);
+}
+
+
+void
+rpl_parallel::reset()
+{
+  my_hash_reset(&domain_hash);
+  current= NULL;
+  sql_thread_stopping= false;
+}
+
+
+rpl_parallel::~rpl_parallel()
+{
+  my_hash_free(&domain_hash);
+}
+
+
+rpl_parallel_entry *
+rpl_parallel::find(uint32 domain_id)
+{
+  struct rpl_parallel_entry *e;
+
+  if (!(e= (rpl_parallel_entry *)my_hash_search(&domain_hash,
+                                                (const uchar *)&domain_id, 0)))
+  {
+    /* Allocate a new, empty one. */
+    ulong count= opt_slave_domain_parallel_threads;
+    if (count == 0 || count > opt_slave_parallel_threads)
+      count= opt_slave_parallel_threads;
+    rpl_parallel_thread **p;
+    if (!my_multi_malloc(PSI_INSTRUMENT_ME, MYF(MY_WME|MY_ZEROFILL),
+                         &e, sizeof(*e),
+                         &p, count*sizeof(*p),
+                         NULL))
+    {
+      my_error(ER_OUTOFMEMORY, MYF(0), (int)(sizeof(*e)+count*sizeof(*p)));
+      return NULL;
+    }
+    e->rpl_threads= p;
+    e->rpl_thread_max= count;
+    e->domain_id= domain_id;
+    e->stop_on_error_sub_id= (uint64)ULONGLONG_MAX;
+    e->pause_sub_id= (uint64)ULONGLONG_MAX;
+    if (my_hash_insert(&domain_hash, (uchar *)e))
+    {
+      my_free(e);
+      return NULL;
+    }
+    mysql_mutex_init(key_LOCK_parallel_entry, &e->LOCK_parallel_entry,
+                     MY_MUTEX_INIT_FAST);
+    mysql_cond_init(key_COND_parallel_entry, &e->COND_parallel_entry, NULL);
+  }
+  else
+    e->force_abort= false;
+
+  return e;
+}
+
+/**
+  Wait until all SQL worker threads have stopped processing.
+
+  This is called when the SQL thread has been killed or stopped.
+*/
+
+void
+rpl_parallel::wait_for_done(THD *thd, Relay_log_info *rli)
+{
+  struct rpl_parallel_entry *e;
+  rpl_parallel_thread *rpt;
+  uint32 i, j;
+
+  /*
+    First signal all workers that they must force quit; no more events will
+    be queued to complete any partially executed event groups.
+  */
+  for (i= 0; i < domain_hash.records; ++i)
+  {
+    e= (struct rpl_parallel_entry *)my_hash_element(&domain_hash, i);
+    mysql_mutex_lock(&e->LOCK_parallel_entry);
+    /*
+      We want the worker threads to stop as quickly as is safe. If the slave
+      SQL threads are behind, we could have a significant amount of events
+      queued for the workers, and we want to stop without waiting for them
+      all to be applied first. But if any event group has already started
+      executing in a worker, we want to be sure that all prior event groups
+      are also executed, so that we stop at a consistent point in the binlog
+      stream (per replication domain).
+
+      All event groups wait for e->count_committing_event_groups to reach
+      the value of group_commit_orderer::wait_count before starting to
+      execute. Thus, at this point we know that any event group with a
+      strictly larger wait_count is safe to skip; none of them can have
+      started executing yet. So we set e->stop_count here and use it to
+      decide in the worker threads whether to continue executing an event
+      group or whether to skip it, when force_abort is set.
+
+      If we stop due to reaching the START SLAVE UNTIL condition, then we
+      need to continue executing any queued events up to that point.
+    */
+    e->force_abort= true;
+    e->stop_count= rli->stop_for_until ?
+      e->count_queued_event_groups : e->count_committing_event_groups;
+    mysql_mutex_unlock(&e->LOCK_parallel_entry);
+    for (j= 0; j < e->rpl_thread_max; ++j)
+    {
+      if ((rpt= e->rpl_threads[j]))
+      {
+        mysql_mutex_lock(&rpt->LOCK_rpl_thread);
+        if (rpt->current_owner == &e->rpl_threads[j])
+          mysql_cond_signal(&rpt->COND_rpl_thread);
+        mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
+      }
+    }
+  }
+  DBUG_EXECUTE_IF("rpl_parallel_wait_for_done_trigger",
+    {
+      debug_sync_set_action(thd,
+        STRING_WITH_LEN("now SIGNAL wait_for_done_waiting"));
+    };);
+
+  for (i= 0; i < domain_hash.records; ++i)
+  {
+    e= (struct rpl_parallel_entry *)my_hash_element(&domain_hash, i);
+    for (j= 0; j < e->rpl_thread_max; ++j)
+    {
+      if ((rpt= e->rpl_threads[j]))
+      {
+        mysql_mutex_lock(&rpt->LOCK_rpl_thread);
+        while (rpt->current_owner == &e->rpl_threads[j])
+          mysql_cond_wait(&rpt->COND_rpl_thread_stop, &rpt->LOCK_rpl_thread);
+        mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
+      }
+    }
+  }
+}
+
+
+/*
+  This function handles the case where the SQL driver thread reached the
+  START SLAVE UNTIL position; we stop queueing more events but continue
+  processing remaining, already queued events; then the user executes a
+  manual STOP SLAVE; then this function signals to the worker threads that
+  they should stop processing any remaining queued events.
+*/
+void
+rpl_parallel::stop_during_until()
+{
+  struct rpl_parallel_entry *e;
+  uint32 i;
+
+  for (i= 0; i < domain_hash.records; ++i)
+  {
+    e= (struct rpl_parallel_entry *)my_hash_element(&domain_hash, i);
+    mysql_mutex_lock(&e->LOCK_parallel_entry);
+    if (e->force_abort)
+      e->stop_count= e->count_committing_event_groups;
+    mysql_mutex_unlock(&e->LOCK_parallel_entry);
+  }
+}
+
+
+bool
+rpl_parallel::workers_idle()
+{
+  struct rpl_parallel_entry *e;
+  uint32 i, max_i;
+
+  max_i= domain_hash.records;
+  for (i= 0; i < max_i; ++i)
+  {
+    bool active;
+    e= (struct rpl_parallel_entry *)my_hash_element(&domain_hash, i);
+    mysql_mutex_lock(&e->LOCK_parallel_entry);
+    active= e->current_sub_id > e->last_committed_sub_id;
+    mysql_mutex_unlock(&e->LOCK_parallel_entry);
+    if (active)
+      break;
+  }
+  return (i == max_i);
+}
+
+
+int
+rpl_parallel_entry::queue_master_restart(rpl_group_info *rgi,
+                                         Format_description_log_event *fdev)
+{
+  uint32 idx;
+  rpl_parallel_thread *thr;
+  rpl_parallel_thread::queued_event *qev;
+  Relay_log_info *rli= rgi->rli;
+
+  /*
+    We only need to queue the server restart if we still have a thread working
+    on a (potentially partial) event group.
+
+    If the last thread we queued for has finished, then it cannot have any
+    partial event group that needs aborting.
+
+    Thus there is no need for the full complexity of choose_thread(). We only
+    need to check if we have a current worker thread, and queue for it if so.
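+
+    In sketch form, the fast path below is just:
+
+      if (!(thr= rpl_threads[idx]) || thr->current_owner != &rpl_threads[idx])
+        return 0;               // worker already idle, nothing to abort
+
+    (plus the locking that makes the ownership check safe).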
+  */
+  idx= rpl_thread_idx;
+  thr= rpl_threads[idx];
+  if (!thr)
+    return 0;
+  mysql_mutex_lock(&thr->LOCK_rpl_thread);
+  if (thr->current_owner != &rpl_threads[idx])
+  {
+    /* No active worker thread, so no need to queue the master restart. */
+    mysql_mutex_unlock(&thr->LOCK_rpl_thread);
+    return 0;
+  }
+
+  if (!(qev= thr->get_qev(fdev, 0, rli)))
+  {
+    mysql_mutex_unlock(&thr->LOCK_rpl_thread);
+    return 1;
+  }
+
+  qev->rgi= rgi;
+  qev->typ= rpl_parallel_thread::queued_event::QUEUED_MASTER_RESTART;
+  qev->entry_for_queued= this;
+  qev->ir= rli->last_inuse_relaylog;
+  ++qev->ir->queued_count;
+  thr->enqueue(qev);
+  mysql_cond_signal(&thr->COND_rpl_thread);
+  mysql_mutex_unlock(&thr->LOCK_rpl_thread);
+  return 0;
+}
+
+
+int
+rpl_parallel::wait_for_workers_idle(THD *thd)
+{
+  uint32 i, max_i;
+
+  /*
+    The domain_hash is only accessed by the SQL driver thread, so it is safe
+    to iterate over without a lock.
+  */
+  max_i= domain_hash.records;
+  for (i= 0; i < max_i; ++i)
+  {
+    PSI_stage_info old_stage;
+    struct rpl_parallel_entry *e;
+    int err= 0;
+
+    e= (struct rpl_parallel_entry *)my_hash_element(&domain_hash, i);
+    mysql_mutex_lock(&e->LOCK_parallel_entry);
+    ++e->need_sub_id_signal;
+    thd->ENTER_COND(&e->COND_parallel_entry, &e->LOCK_parallel_entry,
+                    &stage_waiting_for_workers_idle, &old_stage);
+    while (e->current_sub_id > e->last_committed_sub_id)
+    {
+      if (unlikely(thd->check_killed()))
+      {
+        err= 1;
+        break;
+      }
+      mysql_cond_wait(&e->COND_parallel_entry, &e->LOCK_parallel_entry);
+    }
+    --e->need_sub_id_signal;
+    thd->EXIT_COND(&old_stage);
+    if (err)
+      return err;
+  }
+  return 0;
+}
+
+
+/*
+  Handle seeing a GTID during slave restart in GTID mode. If we stopped with
+  different replication domains having reached different positions in the
+  relay log, we need to skip event groups in domains that are further
+  progressed.
+
+  Updates the state with the seen GTID, and returns true if this GTID should
+  be skipped, false otherwise.
+*/
+bool
+process_gtid_for_restart_pos(Relay_log_info *rli, rpl_gtid *gtid)
+{
+  slave_connection_state::entry *gtid_entry;
+  slave_connection_state *state= &rli->restart_gtid_pos;
+
+  if (likely(state->count() == 0) ||
+      !(gtid_entry= state->find_entry(gtid->domain_id)))
+    return false;
+  if (gtid->server_id == gtid_entry->gtid.server_id)
+  {
+    uint64 seq_no= gtid_entry->gtid.seq_no;
+    if (gtid->seq_no >= seq_no)
+    {
+      /*
+        This domain has reached its start position. So remove it, so that
+        further events will be processed normally.
+      */
+      state->remove(&gtid_entry->gtid);
+    }
+    return gtid->seq_no <= seq_no;
+  }
+  else
+    return true;
+}
+
+
+/*
+  This is used when we get an error during processing in do_event();
+  we will not queue any event to the thread, but we still need to wake it up
+  to be sure that it will be returned to the pool.
+*/
+static void
+abandon_worker_thread(THD *thd, rpl_parallel_thread *cur_thread,
+                      bool *did_enter_cond, PSI_stage_info *old_stage)
+{
+  unlock_or_exit_cond(thd, &cur_thread->LOCK_rpl_thread,
+                      did_enter_cond, old_stage);
+  mysql_cond_signal(&cur_thread->COND_rpl_thread);
+}
+
+
+/*
+  do_event() is executed by the sql_driver_thd thread.
+  Its main purpose is to find a thread that can execute the query.
+
+  @retval  0    ok, event was accepted
+  @retval  1    error
+  @retval -1    event should be executed serially, in the sql driver thread
+*/
+
+int
+rpl_parallel::do_event(rpl_group_info *serial_rgi, Log_event *ev,
+                       ulonglong event_size)
+{
+  rpl_parallel_entry *e;
+  rpl_parallel_thread *cur_thread;
+  rpl_parallel_thread::queued_event *qev;
+  rpl_group_info *rgi= NULL;
+  Relay_log_info *rli= serial_rgi->rli;
+  enum Log_event_type typ;
+  bool is_group_event;
+  bool did_enter_cond= false;
+  PSI_stage_info old_stage;
+
+  DBUG_EXECUTE_IF("slave_crash_if_parallel_apply", DBUG_SUICIDE(););
+  /* Handle master log name change, seen in Rotate_log_event. */
+  typ= ev->get_type_code();
+  if (unlikely(typ == ROTATE_EVENT))
+  {
+    Rotate_log_event *rev= static_cast<Rotate_log_event *>(ev);
+    if ((rev->server_id != global_system_variables.server_id ||
+         rli->replicate_same_server_id) &&
+        !rev->is_relay_log_event() &&
+        !rli->is_in_group())
+    {
+      memcpy(rli->future_event_master_log_name,
+             rev->new_log_ident, rev->ident_len+1);
+      rli->notify_group_master_log_name_update();
+    }
+  }
+
+  /*
+    Execute queries non-parallel if slave_skip_counter is set, as it is
+    easier to skip queries in single-threaded mode.
+  */
+  if (rli->slave_skip_counter)
+    return -1;
+
+  /* Execute pre-10.0 events, which have no GTID, in single-threaded mode. */
+  is_group_event= Log_event::is_group_event(typ);
+  if (unlikely(!current) && typ != GTID_EVENT &&
+      !(unlikely(rli->gtid_skip_flag != GTID_SKIP_NOT) && is_group_event))
+    return -1;
+
+  /* Note: rli->data_lock is released by sql_delay_event(). */
+  if (sql_delay_event(ev, rli->sql_driver_thd, serial_rgi))
+  {
+    /*
+      If sql_delay_event() returns non-zero, it means that the wait timed out
+      due to slave stop. We should not queue the event in this case, it must
+      not be applied yet.
+    */
+    delete ev;
+    return 1;
+  }
+
+  if (unlikely(typ == FORMAT_DESCRIPTION_EVENT))
+  {
+    Format_description_log_event *fdev=
+      static_cast<Format_description_log_event *>(ev);
+    if (fdev->created)
+    {
+      /*
+        This format description event marks a new binlog after a master server
+        restart. We are going to close all temporary tables to clean up any
+        possible left-overs after a prior master crash.
+
+        Thus we need to wait for all prior events to execute to completion,
+        in case they need access to any of the temporary tables.
+
+        We also need to notify the worker thread running the prior incomplete
+        event group (if any), as such an event group signifies an incompletely
+        written group cut short by a master crash, and must be rolled back.
+      */
+      if (current->queue_master_restart(serial_rgi, fdev) ||
+          wait_for_workers_idle(rli->sql_driver_thd))
+      {
+        delete ev;
+        return 1;
+      }
+    }
+  }
+  else if (unlikely(typ == GTID_LIST_EVENT))
+  {
+    Gtid_list_log_event *glev= static_cast<Gtid_list_log_event *>(ev);
+    rpl_gtid *list= glev->list;
+    uint32 count= glev->count;
+    rli->update_relay_log_state(list, count);
+    while (count)
+    {
+      process_gtid_for_restart_pos(rli, list);
+      ++list;
+      --count;
+    }
+  }
+
+  /*
+    Stop queueing additional event groups once the SQL thread is requested to
+    stop.
+
+    We have to queue any remaining events of any event group that has already
+    been partially queued, but after that we will just ignore any further
+    events the SQL driver thread may try to queue, and eventually it will
+    stop.
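+
+    A group boundary is recognized below as either a GTID event (the start
+    of the next group) or any event that is not part of a group; only at
+    such a boundary is sql_thread_stopping latched from rli->abort_slave.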
+  */
+  if ((typ == GTID_EVENT || !is_group_event) && rli->abort_slave)
+    sql_thread_stopping= true;
+  if (sql_thread_stopping)
+  {
+    delete ev;
+    /*
+      Return "no error"; normal stop is not an error, and otherwise the error
+      has already been recorded.
+    */
+    return 0;
+  }
+
+  if (unlikely(rli->gtid_skip_flag != GTID_SKIP_NOT) && is_group_event)
+  {
+    if (typ == GTID_EVENT)
+      rli->gtid_skip_flag= GTID_SKIP_NOT;
+    else
+    {
+      if (rli->gtid_skip_flag == GTID_SKIP_STANDALONE)
+      {
+        if (!Log_event::is_part_of_group(typ))
+          rli->gtid_skip_flag= GTID_SKIP_NOT;
+      }
+      else
+      {
+        DBUG_ASSERT(rli->gtid_skip_flag == GTID_SKIP_TRANSACTION);
+        if (typ == XID_EVENT || typ == XA_PREPARE_LOG_EVENT ||
+            (typ == QUERY_EVENT &&  // COMMIT/ROLLBACK are never compressed
+             (((Query_log_event *)ev)->is_commit() ||
+              ((Query_log_event *)ev)->is_rollback())))
+          rli->gtid_skip_flag= GTID_SKIP_NOT;
+      }
+      delete_or_keep_event_post_apply(serial_rgi, typ, ev);
+      return 0;
+    }
+  }
+
+  Gtid_log_event *gtid_ev= NULL;
+  if (typ == GTID_EVENT)
+  {
+    rpl_gtid gtid;
+    gtid_ev= static_cast<Gtid_log_event *>(ev);
+    uint32 domain_id= (rli->mi->using_gtid == Master_info::USE_GTID_NO ||
+                       rli->mi->parallel_mode <= SLAVE_PARALLEL_MINIMAL ?
+                       0 : gtid_ev->domain_id);
+    if (!(e= find(domain_id)))
+    {
+      my_error(ER_OUT_OF_RESOURCES, MYF(MY_WME));
+      delete ev;
+      return 1;
+    }
+    current= e;
+
+    gtid.domain_id= gtid_ev->domain_id;
+    gtid.server_id= gtid_ev->server_id;
+    gtid.seq_no= gtid_ev->seq_no;
+    rli->update_relay_log_state(&gtid, 1);
+    if (process_gtid_for_restart_pos(rli, &gtid))
+    {
+      /*
+        This domain has progressed further into the relay log before the last
+        SQL thread restart. So we need to skip this event group to avoid
+        applying it twice.
+      */
+      rli->gtid_skip_flag= ((gtid_ev->flags2 & Gtid_log_event::FL_STANDALONE) ?
+                            GTID_SKIP_STANDALONE : GTID_SKIP_TRANSACTION);
+      delete_or_keep_event_post_apply(serial_rgi, typ, ev);
+      return 0;
+    }
+  }
+  else
+    e= current;
+
+  /*
+    Find a worker thread to queue the event for.
+    Prefer a new thread, so we maximise parallelism (at least for the group
+    commit). But do not exceed a limit of --slave-domain-parallel-threads;
+    instead re-use a thread that we queued for previously.
+  */
+  cur_thread=
+    e->choose_thread(serial_rgi, &did_enter_cond, &old_stage, gtid_ev);
+  if (!cur_thread)
+  {
+    /* This means we were killed. The error is already signalled. */
+    delete ev;
+    return 1;
+  }
+
+  if (!(qev= cur_thread->get_qev(ev, event_size, rli)))
+  {
+    abandon_worker_thread(rli->sql_driver_thd, cur_thread,
+                          &did_enter_cond, &old_stage);
+    delete ev;
+    return 1;
+  }
+
+  if (typ == GTID_EVENT)
+  {
+    bool new_gco;
+    enum_slave_parallel_mode mode= rli->mi->parallel_mode;
+    uchar gtid_flags= gtid_ev->flags2;
+    group_commit_orderer *gco;
+    uint8 force_switch_flag;
+    enum rpl_group_info::enum_speculation speculation;
+
+    if (!(rgi= cur_thread->get_rgi(rli, gtid_ev, e, event_size)))
+    {
+      cur_thread->free_qev(qev);
+      abandon_worker_thread(rli->sql_driver_thd, cur_thread,
+                            &did_enter_cond, &old_stage);
+      delete ev;
+      return 1;
+    }
+
+    /*
+      We queue the event group in a new worker thread, to run in parallel
+      with previous groups.
+
+      To preserve commit order within the replication domain, we set up
+      rgi->wait_commit_sub_id to make the new group commit only after the
+      previous group has committed.
+
+      Event groups that group-committed together on the master can be run
+      in parallel with each other without restrictions.
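+      (Such groups are recognized below by carrying the same commit_id in
+      their GTID events, when Gtid_log_event::FL_GROUP_COMMIT_ID is set.)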
But one batch of
+      group-commits may not start before all groups in the previous batch
+      have initiated their commit phase; we set up rgi->gco to ensure that.
+    */
+    rgi->wait_commit_sub_id= e->current_sub_id;
+    rgi->wait_commit_group_info= e->current_group_info;
+
+    speculation= rpl_group_info::SPECULATE_NO;
+    new_gco= true;
+    force_switch_flag= 0;
+    gco= e->current_gco;
+    if (likely(gco))
+    {
+      uint8 flags= gco->flags;
+
+      if (mode <= SLAVE_PARALLEL_MINIMAL ||
+          !(gtid_flags & Gtid_log_event::FL_GROUP_COMMIT_ID) ||
+          e->last_commit_id != gtid_ev->commit_id)
+        flags|= group_commit_orderer::MULTI_BATCH;
+      /* Make sure we do not attempt to run DDL in parallel speculatively. */
+      if (gtid_flags & Gtid_log_event::FL_DDL)
+        flags|= (force_switch_flag= group_commit_orderer::FORCE_SWITCH);
+
+      if (!(flags & group_commit_orderer::MULTI_BATCH))
+      {
+        /*
+          Still the same batch of event groups that group-committed together
+          on the master, so we can run in parallel.
+        */
+        new_gco= false;
+      }
+      else if ((mode >= SLAVE_PARALLEL_OPTIMISTIC) &&
+               !(flags & group_commit_orderer::FORCE_SWITCH))
+      {
+        /*
+          In transactional parallel mode, we optimistically attempt to run
+          non-DDL in parallel. In case of conflicts, we catch the conflict as
+          a deadlock or other error, roll back and retry serially.
+
+          The assumption is that only a few event groups will be
+          non-transactional or otherwise unsuitable for parallel apply. Those
+          transactions are still scheduled in parallel, but we set a flag that
+          will make the worker thread wait for everything before them to
+          complete before starting.
+        */
+        new_gco= false;
+        if (!(gtid_flags & Gtid_log_event::FL_TRANSACTIONAL) ||
+            ( (!(gtid_flags & Gtid_log_event::FL_ALLOW_PARALLEL) ||
+               (gtid_flags & Gtid_log_event::FL_WAITED)) &&
+              (mode < SLAVE_PARALLEL_AGGRESSIVE)))
+        {
+          /*
+            This transaction should not be speculatively run in parallel with
+            what came before, either because it cannot safely be rolled back
+            in case of a conflict, or because it was marked as likely to
+            conflict and require expensive rollback and retry.
+
+            Here we mark it as such, and then the worker thread will do a
+            wait_for_prior_commit() before starting it. We do not introduce a
+            new group_commit_orderer, since we still want following
+            transactions to run in parallel with transactions prior to this
+            one.
+          */
+          speculation= rpl_group_info::SPECULATE_WAIT;
+        }
+        else
+          speculation= rpl_group_info::SPECULATE_OPTIMISTIC;
+      }
+      gco->flags= flags;
+    }
+    else
+    {
+      if (gtid_flags & Gtid_log_event::FL_DDL)
+        force_switch_flag= group_commit_orderer::FORCE_SWITCH;
+    }
+    rgi->speculation= speculation;
+
+    if (gtid_flags & Gtid_log_event::FL_GROUP_COMMIT_ID)
+      e->last_commit_id= gtid_ev->commit_id;
+    else
+      e->last_commit_id= 0;
+
+    if (new_gco)
+    {
+      /*
+        Do not run this event group in parallel with what came before;
+        instead wait for everything prior to at least have started its commit
+        phase, to avoid any risk of performing any conflicting action too
+        early.
+
+        Remember the count that marks the end of the previous batch of event
+        groups that run in parallel, and allocate a new gco.
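+
+        In other words, the new gco's wait_count below is a snapshot of
+        count_queued_event_groups: event groups scheduled under it may not
+        start until count_committing_event_groups has caught up with that
+        snapshot, i.e. until every earlier group has entered its commit
+        phase.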
+      */
+      uint64 count= e->count_queued_event_groups;
+
+      if (!(gco= cur_thread->get_gco(count, gco, e->current_sub_id)))
+      {
+        cur_thread->free_rgi(rgi);
+        cur_thread->free_qev(qev);
+        abandon_worker_thread(rli->sql_driver_thd, cur_thread,
+                              &did_enter_cond, &old_stage);
+        delete ev;
+        return 1;
+      }
+      gco->flags|= force_switch_flag;
+      e->current_gco= gco;
+    }
+    rgi->gco= gco;
+
+    qev->rgi= e->current_group_info= rgi;
+    e->current_sub_id= rgi->gtid_sub_id;
+    ++e->count_queued_event_groups;
+  }
+  else if (!is_group_event)
+  {
+    int err;
+    bool tmp;
+    /*
+      Events like ROTATE and FORMAT_DESCRIPTION. Do not run in worker thread.
+      Same for events not preceded by a GTID (we should not see those
+      normally, but they might be from an old master).
+    */
+    qev->rgi= serial_rgi;
+
+    tmp= serial_rgi->is_parallel_exec;
+    serial_rgi->is_parallel_exec= true;
+    err= rpt_handle_event(qev, NULL);
+    serial_rgi->is_parallel_exec= tmp;
+    if (ev->is_relay_log_event())
+      qev->future_event_master_log_pos= 0;
+    else if (typ == ROTATE_EVENT)
+      qev->future_event_master_log_pos=
+        (static_cast<Rotate_log_event *>(ev))->pos;
+    else
+      qev->future_event_master_log_pos= ev->log_pos;
+    delete_or_keep_event_post_apply(serial_rgi, typ, ev);
+
+    if (err)
+    {
+      cur_thread->free_qev(qev);
+      abandon_worker_thread(rli->sql_driver_thd, cur_thread,
+                            &did_enter_cond, &old_stage);
+      return 1;
+    }
+    /*
+      Queue a position update, so that the position will be updated in a
+      reasonable way relative to other events:
+
+      - If the currently executing events are queued serially for a single
+        thread, the position will only be updated when everything before has
+        completed.
+
+      - If we are executing multiple independent events in parallel, then at
+        least the position will not be updated until one of them has reached
+        the current point.
+    */
+    qev->typ= rpl_parallel_thread::queued_event::QUEUED_POS_UPDATE;
+    qev->entry_for_queued= e;
+  }
+  else
+  {
+    qev->rgi= e->current_group_info;
+  }
+
+  /*
+    Queue the event for processing.
+  */
+  qev->ir= rli->last_inuse_relaylog;
+  ++qev->ir->queued_count;
+  cur_thread->enqueue(qev);
+  unlock_or_exit_cond(rli->sql_driver_thd, &cur_thread->LOCK_rpl_thread,
+                      &did_enter_cond, &old_stage);
+  mysql_cond_signal(&cur_thread->COND_rpl_thread);
+
+  return 0;
+}
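+
+/*
+  Quick reference (a summary of the code above, not additional behavior):
+  the server options that drive the scheduling decisions in this file, with
+  the variable names used above:
+
+    --slave-parallel-threads         opt_slave_parallel_threads; size of the
+                                     global worker pool activated in
+                                     rpl_parallel_activate_pool().
+    --slave-domain-parallel-threads  opt_slave_domain_parallel_threads;
+                                     per-domain worker cap applied in
+                                     rpl_parallel::find().
+    --slave-parallel-max-queued      opt_slave_parallel_max_queued;
+                                     per-worker queued-event memory limit
+                                     enforced in choose_thread().
+    --slave-parallel-mode            Master_info::parallel_mode; selects the
+                                     minimal/conservative/optimistic/
+                                     aggressive scheduling paths in
+                                     do_event().
+*/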