author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 18:04:16 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 18:04:16 +0000
commit     a68fb2d8219f6bccc573009600e9f23e89226a5e (patch)
tree       d742d35d14ae816e99293d2b01face30e9f3a46b /storage/innobase/trx
parent     Initial commit. (diff)
download   mariadb-10.6-a68fb2d8219f6bccc573009600e9f23e89226a5e.tar.xz
           mariadb-10.6-a68fb2d8219f6bccc573009600e9f23e89226a5e.zip

Adding upstream version 1:10.6.11. (tags: upstream/1%10.6.11, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/trx')
-rw-r--r--  storage/innobase/trx/trx0i_s.cc    1471
-rw-r--r--  storage/innobase/trx/trx0purge.cc  1416
-rw-r--r--  storage/innobase/trx/trx0rec.cc    2426
-rw-r--r--  storage/innobase/trx/trx0roll.cc    927
-rw-r--r--  storage/innobase/trx/trx0rseg.cc    713
-rw-r--r--  storage/innobase/trx/trx0sys.cc     357
-rw-r--r--  storage/innobase/trx/trx0trx.cc    2180
-rw-r--r--  storage/innobase/trx/trx0undo.cc   1581
8 files changed, 11071 insertions, 0 deletions
diff --git a/storage/innobase/trx/trx0i_s.cc b/storage/innobase/trx/trx0i_s.cc
new file mode 100644
index 00000000..2dc39118
--- /dev/null
+++ b/storage/innobase/trx/trx0i_s.cc
@@ -0,0 +1,1471 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0i_s.cc
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables fetch code.
+
+The code below fetches information needed to fill those
+3 dynamic tables and uploads it into a "transactions
+table cache" for later retrieval.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+#include "trx0i_s.h"
+#include "buf0buf.h"
+#include "dict0dict.h"
+#include "ha0storage.h"
+#include "hash0hash.h"
+#include "lock0iter.h"
+#include "lock0lock.h"
+#include "mem0mem.h"
+#include "page0page.h"
+#include "rem0rec.h"
+#include "row0row.h"
+#include "srv0srv.h"
+#include "trx0sys.h"
+#include "que0que.h"
+#include "trx0purge.h"
+#include "sql_class.h"
+
+/** Initial number of rows in the table cache */
+#define TABLE_CACHE_INITIAL_ROWSNUM 1024
+
+/** @brief The maximum number of chunks to allocate for a table cache.
+
+The rows of a table cache are stored in a set of chunks. When a new
+row is added a new chunk is allocated if necessary. Assuming that the
+first one is 1024 rows (TABLE_CACHE_INITIAL_ROWSNUM) and each
+subsequent is N/2 where N is the number of rows we have allocated till
+now, then 39th chunk would accommodate 1677416425 rows and all chunks
+would accommodate 3354832851 rows. */
+#define MEM_CHUNKS_IN_TABLE_CACHE 39
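
A minimal standalone sketch (not part of this file) replays the growth rule above with the same integer arithmetic as table_cache_create_empty_row(): the first chunk holds 1024 rows and each later chunk holds half of the rows allocated so far. It reproduces both figures quoted in the comment; 3354832851 turns out to be the capacity of the first 38 chunks, and the 39th chunk adds 1677416425 more.

#include <cstdio>

int main()
{
  unsigned long long allocd = 0;        /* rows allocated so far */
  unsigned long long chunk = 0;         /* rows in the current chunk */
  for (unsigned i = 0; i < 39; i++) {   /* MEM_CHUNKS_IN_TABLE_CACHE */
    if (i == 38)
      printf("chunks 1..38 hold %llu rows\n", allocd);
    chunk = i ? allocd / 2 : 1024;      /* TABLE_CACHE_INITIAL_ROWSNUM */
    allocd += chunk;
  }
  printf("chunk 39 holds %llu rows\n", chunk);
  return 0;
}
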
+
+/** The following are some testing auxiliary macros. Do not enable them
+in a production environment. */
+/* @{ */
+
+#if 0
+/** If this is enabled then lock folds will always be different
+resulting in equal rows being put into different cells of the hash
+table. Checking for duplicates will be flawed because a different
+fold will be calculated when a row is searched in the hash table. */
+#define TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+#endif
+
+#if 0
+/** This effectively kills the search-for-duplicate-before-adding-a-row
+function, but searching in the hash is still performed. It will always
+be assumed that lock is not present and insertion will be performed in
+the hash table. */
+#define TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+#endif
+
+#if 0
+/** This aggressively repeats adding each row many times. Depending on
+the above settings this may be noop or may result in lots of rows being
+added. */
+#define TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+#endif
+
+#if 0
+/** Very similar to TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T but hash
+table search is not performed at all. */
+#define TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+#endif
+
+#if 0
+/** Do not insert each row into the hash table, duplicates may appear
+if this is enabled, also if this is enabled searching into the hash is
+noop because it will be empty. */
+#define TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+#endif
+/* @} */
+
+/** Memory limit passed to ha_storage_put_memlim().
+@param cache hash storage
+@return maximum allowed allocation size */
+#define MAX_ALLOWED_FOR_STORAGE(cache) \
+ (TRX_I_S_MEM_LIMIT \
+ - (cache)->mem_allocd)
+
+/** Memory limit in table_cache_create_empty_row().
+@param cache hash storage
+@return maximum allowed allocation size */
+#define MAX_ALLOWED_FOR_ALLOC(cache) \
+ (TRX_I_S_MEM_LIMIT \
+ - (cache)->mem_allocd \
+ - ha_storage_get_size((cache)->storage))
+
+/** Memory for each table in the intermediate buffer is allocated in
+separate chunks. These chunks are considered to be concatenated to
+represent one flat array of rows. */
+struct i_s_mem_chunk_t {
+ ulint offset; /*!< offset, in number of rows */
+ ulint rows_allocd; /*!< the size of this chunk, in number
+ of rows */
+ void* base; /*!< start of the chunk */
+};
+
+/** This represents one table's cache. */
+struct i_s_table_cache_t {
+ ulint rows_used; /*!< number of used rows */
+ ulint rows_allocd; /*!< number of allocated rows */
+ ulint row_size; /*!< size of a single row */
+ i_s_mem_chunk_t chunks[MEM_CHUNKS_IN_TABLE_CACHE]; /*!< array of
+ memory chunks that stores the
+ rows */
+};
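
In other words, the chunks form one logical flat array of rows: each chunk records the row offset at which it starts, so row n lives in the first chunk whose offset + rows_allocd exceeds n. A minimal sketch of that lookup, assuming only the structs above (the helper name is illustrative; the same walk is performed by trx_i_s_cache_get_nth_row() later in this file):

static void* nth_row_sketch(const i_s_table_cache_t* t, ulint n)
{
  for (ulint i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
    const i_s_mem_chunk_t* c = &t->chunks[i];
    if (n < c->offset + c->rows_allocd)   /* row n lives in this chunk */
      return (char*) c->base + (n - c->offset) * t->row_size;
  }
  return NULL;                            /* n is beyond the allocated rows */
}
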
+
+/** This structure describes the intermediate buffer */
+struct trx_i_s_cache_t {
+ srw_lock rw_lock; /*!< read-write lock protecting this */
+ Atomic_relaxed<ulonglong> last_read;
+ /*!< last time the cache was read;
+ measured in nanoseconds */
+ i_s_table_cache_t innodb_trx; /*!< innodb_trx table */
+ i_s_table_cache_t innodb_locks; /*!< innodb_locks table */
+ i_s_table_cache_t innodb_lock_waits;/*!< innodb_lock_waits table */
+/** the hash table size is LOCKS_HASH_CELLS_NUM * sizeof(void*) bytes */
+#define LOCKS_HASH_CELLS_NUM 10000
+ hash_table_t locks_hash; /*!< hash table used to eliminate
+ duplicate entries in the
+ innodb_locks table */
+/** Initial size of the cache storage */
+#define CACHE_STORAGE_INITIAL_SIZE 1024
+/** Number of hash cells in the cache storage */
+#define CACHE_STORAGE_HASH_CELLS 2048
+ ha_storage_t* storage; /*!< storage for external volatile
+ data that may become unavailable
+ when we release
+ lock_sys.latch */
+ ulint mem_allocd; /*!< the amount of memory
+ allocated with mem_alloc*() */
+ bool is_truncated; /*!< this is true if the memory
+ limit was hit and thus the data
+ in the cache is truncated */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+static trx_i_s_cache_t trx_i_s_cache_static;
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+trx_i_s_cache_t* trx_i_s_cache = &trx_i_s_cache_static;
+
+/** @return the heap number of a record lock
+@retval 0xFFFF for table locks */
+static uint16_t wait_lock_get_heap_no(const lock_t *lock)
+{
+ return !lock->is_table()
+ ? static_cast<uint16_t>(lock_rec_find_set_bit(lock))
+ : uint16_t{0xFFFF};
+}
+
+/*******************************************************************//**
+Initializes the members of a table cache. */
+static
+void
+table_cache_init(
+/*=============*/
+ i_s_table_cache_t* table_cache, /*!< out: table cache */
+ size_t row_size) /*!< in: the size of a
+ row */
+{
+ ulint i;
+
+ table_cache->rows_used = 0;
+ table_cache->rows_allocd = 0;
+ table_cache->row_size = row_size;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ /* the memory is actually allocated in
+ table_cache_create_empty_row() */
+ table_cache->chunks[i].base = NULL;
+ }
+}
+
+/*******************************************************************//**
+Frees a table cache. */
+static
+void
+table_cache_free(
+/*=============*/
+ i_s_table_cache_t* table_cache) /*!< in/out: table cache */
+{
+ ulint i;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ /* the memory is actually allocated in
+ table_cache_create_empty_row() */
+ if (table_cache->chunks[i].base) {
+ ut_free(table_cache->chunks[i].base);
+ table_cache->chunks[i].base = NULL;
+ }
+ }
+}
+
+/*******************************************************************//**
+Returns an empty row from a table cache. The row is allocated if no more
+empty rows are available. The number of used rows is incremented.
+If the memory limit is hit then NULL is returned and nothing is
+allocated.
+@return empty row, or NULL if out of memory */
+static
+void*
+table_cache_create_empty_row(
+/*=========================*/
+ i_s_table_cache_t* table_cache, /*!< in/out: table cache */
+ trx_i_s_cache_t* cache) /*!< in/out: cache to record
+ how many bytes are
+ allocated */
+{
+ ulint i;
+ void* row;
+
+ ut_a(table_cache->rows_used <= table_cache->rows_allocd);
+
+ if (table_cache->rows_used == table_cache->rows_allocd) {
+
+ /* rows_used == rows_allocd means that new chunk needs
+ to be allocated: either no more empty rows in the
+ last allocated chunk or nothing has been allocated yet
+ (rows_num == rows_allocd == 0); */
+
+ i_s_mem_chunk_t* chunk;
+ ulint req_bytes;
+ ulint got_bytes;
+ ulint req_rows;
+ ulint got_rows;
+
+ /* find the first not allocated chunk */
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].base == NULL) {
+
+ break;
+ }
+ }
+
+ /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+ have been allocated :-X */
+ ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+ /* allocate the chunk we just found */
+
+ if (i == 0) {
+
+ /* first chunk, nothing is allocated yet */
+ req_rows = TABLE_CACHE_INITIAL_ROWSNUM;
+ } else {
+
+ /* Memory is increased by the formula
+ new = old + old / 2; We are trying not to be
+ aggressive here (= using the common new = old * 2)
+ because the allocated memory will not be freed
+ until InnoDB exit (it is reused). So it is better
+ to once allocate the memory in more steps, but
+ have less unused/wasted memory than to use less
+ steps in allocation (which is done once in a
+ lifetime) but end up with lots of unused/wasted
+ memory. */
+ req_rows = table_cache->rows_allocd / 2;
+ }
+ req_bytes = req_rows * table_cache->row_size;
+
+ if (req_bytes > MAX_ALLOWED_FOR_ALLOC(cache)) {
+
+ return(NULL);
+ }
+
+ chunk = &table_cache->chunks[i];
+
+ got_bytes = req_bytes;
+ chunk->base = ut_malloc_nokey(req_bytes);
+
+ got_rows = got_bytes / table_cache->row_size;
+
+ cache->mem_allocd += got_bytes;
+
+#if 0
+ printf("allocating chunk %d req bytes=%lu, got bytes=%lu,"
+ " row size=%lu,"
+ " req rows=%lu, got rows=%lu\n",
+ i, req_bytes, got_bytes,
+ table_cache->row_size,
+ req_rows, got_rows);
+#endif
+
+ chunk->rows_allocd = got_rows;
+
+ table_cache->rows_allocd += got_rows;
+
+ /* adjust the offset of the next chunk */
+ if (i < MEM_CHUNKS_IN_TABLE_CACHE - 1) {
+
+ table_cache->chunks[i + 1].offset
+ = chunk->offset + chunk->rows_allocd;
+ }
+
+ /* return the first empty row in the newly allocated
+ chunk */
+ row = chunk->base;
+ } else {
+
+ char* chunk_start;
+ ulint offset;
+
+ /* there is an empty row, no need to allocate new
+ chunks */
+
+ /* find the first chunk that contains allocated but
+ empty/unused rows */
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].offset
+ + table_cache->chunks[i].rows_allocd
+ > table_cache->rows_used) {
+
+ break;
+ }
+ }
+
+ /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+ are full, but
+ table_cache->rows_used != table_cache->rows_allocd means
+ exactly the opposite - there are allocated but
+ empty/unused rows :-X */
+ ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+ chunk_start = (char*) table_cache->chunks[i].base;
+ offset = table_cache->rows_used
+ - table_cache->chunks[i].offset;
+
+ row = chunk_start + offset * table_cache->row_size;
+ }
+
+ table_cache->rows_used++;
+
+ return(row);
+}
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Validates a row in the locks cache.
+@return TRUE if valid */
+static
+ibool
+i_s_locks_row_validate(
+/*===================*/
+ const i_s_locks_row_t* row) /*!< in: row to validate */
+{
+ ut_ad(row->lock_mode);
+ ut_ad(row->lock_table != NULL);
+ ut_ad(row->lock_table_id != 0);
+
+ if (!row->lock_index) {
+ /* table lock */
+ ut_ad(!row->lock_data);
+ ut_ad(row->lock_page == page_id_t(0, 0));
+ ut_ad(!row->lock_rec);
+ } else {
+ /* record lock */
+ /* row->lock_data == NULL if buf_page_try_get() == NULL */
+ }
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Fills i_s_trx_row_t object.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_trx_row(
+/*=========*/
+ i_s_trx_row_t* row, /*!< out: result object
+ that's filled */
+ const trx_t* trx, /*!< in: transaction to
+ get data from */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ corresponding row in
+ innodb_locks if trx is
+ waiting or NULL if trx
+ is not waiting */
+ trx_i_s_cache_t* cache) /*!< in/out: cache into
+ which to copy volatile
+ strings */
+{
+ const char* s;
+
+ lock_sys.assert_locked();
+
+ const lock_t* wait_lock = trx->lock.wait_lock;
+
+ row->trx_id = trx->id;
+ row->trx_started = trx->start_time;
+ if (trx->in_rollback) {
+ row->trx_state = "ROLLING BACK";
+ } else if (trx->state == TRX_STATE_COMMITTED_IN_MEMORY) {
+ row->trx_state = "COMMITTING";
+ } else if (wait_lock) {
+ row->trx_state = "LOCK WAIT";
+ } else {
+ row->trx_state = "RUNNING";
+ }
+
+ row->requested_lock_row = requested_lock_row;
+ ut_ad(requested_lock_row == NULL
+ || i_s_locks_row_validate(requested_lock_row));
+
+ ut_ad(!wait_lock == !requested_lock_row);
+
+ const my_hrtime_t suspend_time= trx->lock.suspend_time;
+ row->trx_wait_started = wait_lock ? hrtime_to_time(suspend_time) : 0;
+
+ row->trx_weight = static_cast<uintmax_t>(TRX_WEIGHT(trx));
+
+ if (trx->mysql_thd == NULL) {
+ /* For internal transactions e.g., purge and transactions
+ being recovered at startup there is no associated MySQL
+ thread data structure. */
+ row->trx_mysql_thread_id = 0;
+ row->trx_query = NULL;
+ goto thd_done;
+ }
+
+ row->trx_mysql_thread_id = thd_get_thread_id(trx->mysql_thd);
+
+ char query[TRX_I_S_TRX_QUERY_MAX_LEN + 1];
+ if (size_t stmt_len = thd_query_safe(trx->mysql_thd, query,
+ sizeof query)) {
+ row->trx_query = static_cast<const char*>(
+ ha_storage_put_memlim(
+ cache->storage, query, stmt_len + 1,
+ MAX_ALLOWED_FOR_STORAGE(cache)));
+
+ row->trx_query_cs = thd_charset(trx->mysql_thd);
+
+ if (row->trx_query == NULL) {
+
+ return(FALSE);
+ }
+ } else {
+
+ row->trx_query = NULL;
+ }
+
+thd_done:
+ row->trx_operation_state = trx->op_info;
+
+ row->trx_tables_in_use = trx->n_mysql_tables_in_use;
+
+ row->trx_tables_locked = lock_number_of_tables_locked(&trx->lock);
+
+ /* These are protected by lock_sys.latch (which we are holding)
+ and sometimes also trx->mutex. */
+
+ row->trx_lock_structs = UT_LIST_GET_LEN(trx->lock.trx_locks);
+
+ row->trx_lock_memory_bytes = mem_heap_get_size(trx->lock.lock_heap);
+
+ row->trx_rows_locked = trx->lock.n_rec_locks;
+
+ row->trx_rows_modified = trx->undo_no;
+
+ row->trx_isolation_level = trx->isolation_level;
+
+ row->trx_unique_checks = (ibool) trx->check_unique_secondary;
+
+ row->trx_foreign_key_checks = (ibool) trx->check_foreigns;
+
+ s = trx->detailed_error;
+
+ if (s != NULL && s[0] != '\0') {
+
+ TRX_I_S_STRING_COPY(s,
+ row->trx_foreign_key_error,
+ TRX_I_S_TRX_FK_ERROR_MAX_LEN, cache);
+
+ if (row->trx_foreign_key_error == NULL) {
+
+ return(FALSE);
+ }
+ } else {
+ row->trx_foreign_key_error = NULL;
+ }
+
+ row->trx_is_read_only = trx->read_only;
+
+ row->trx_is_autocommit_non_locking = trx->is_autocommit_non_locking();
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Format the nth field of "rec" and put it in "buf". The result is always
+NUL-terminated. Returns the number of bytes that were written to "buf"
+(including the terminating NUL).
+@return number of bytes written, including the terminating NUL */
+static
+ulint
+put_nth_field(
+/*==========*/
+ char* buf, /*!< out: buffer */
+ ulint buf_size,/*!< in: buffer size in bytes */
+ ulint n, /*!< in: number of field */
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record */
+ const rec_offs* offsets)/*!< in: record offsets, returned
+ by rec_get_offsets() */
+{
+ const byte* data;
+ ulint data_len;
+ dict_field_t* dict_field;
+ ulint ret;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (buf_size == 0) {
+
+ return(0);
+ }
+
+ ret = 0;
+
+ if (n > 0) {
+ /* we must append ", " before the actual data */
+
+ if (buf_size < 3) {
+
+ buf[0] = '\0';
+ return(1);
+ }
+
+ memcpy(buf, ", ", 3);
+
+ buf += 2;
+ buf_size -= 2;
+ ret += 2;
+ }
+
+ /* now buf_size >= 1 */
+
+ data = rec_get_nth_field(rec, offsets, n, &data_len);
+
+ dict_field = dict_index_get_nth_field(index, n);
+
+ ret += row_raw_format((const char*) data, data_len,
+ dict_field, buf, buf_size);
+
+ return(ret);
+}
+
+/*******************************************************************//**
+Fills the "lock_data" member of i_s_locks_row_t object.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_lock_data(
+/*===========*/
+ const char** lock_data,/*!< out: "lock_data" to fill */
+ const lock_t* lock, /*!< in: lock used to find the data */
+ ulint heap_no,/*!< in: rec num used to find the data */
+ trx_i_s_cache_t* cache) /*!< in/out: cache where to store
+ volatile data */
+{
+ ut_a(!lock->is_table());
+
+ switch (heap_no) {
+ case PAGE_HEAP_NO_INFIMUM:
+ case PAGE_HEAP_NO_SUPREMUM:
+ *lock_data = ha_storage_put_str_memlim(
+ cache->storage,
+ heap_no == PAGE_HEAP_NO_INFIMUM
+ ? "infimum pseudo-record"
+ : "supremum pseudo-record",
+ MAX_ALLOWED_FOR_STORAGE(cache));
+ return(*lock_data != NULL);
+ }
+
+ mtr_t mtr;
+
+ const buf_block_t* block;
+ const page_t* page;
+ const rec_t* rec;
+ ulint n_fields;
+ mem_heap_t* heap;
+ rec_offs offsets_onstack[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets;
+ char buf[TRX_I_S_LOCK_DATA_MAX_LEN];
+ ulint buf_used;
+ ulint i;
+
+ mtr_start(&mtr);
+
+ block = buf_page_try_get(lock->un_member.rec_lock.page_id, &mtr);
+
+ if (block == NULL) {
+
+ *lock_data = NULL;
+
+ mtr_commit(&mtr);
+
+ return(TRUE);
+ }
+
+ page = reinterpret_cast<const page_t*>(buf_block_get_frame(block));
+
+ rec_offs_init(offsets_onstack);
+ offsets = offsets_onstack;
+
+ rec = page_find_rec_with_heap_no(page, heap_no);
+
+ const dict_index_t* index = lock->index;
+ ut_ad(index->is_primary() || !dict_index_is_online_ddl(index));
+
+ n_fields = dict_index_get_n_unique(index);
+
+ ut_a(n_fields > 0);
+
+ heap = NULL;
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ n_fields, &heap);
+
+ /* format and store the data */
+
+ buf_used = 0;
+ for (i = 0; i < n_fields; i++) {
+
+ buf_used += put_nth_field(
+ buf + buf_used, sizeof(buf) - buf_used,
+ i, index, rec, offsets) - 1;
+ }
+
+ *lock_data = (const char*) ha_storage_put_memlim(
+ cache->storage, buf, buf_used + 1,
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ if (heap != NULL) {
+
+ /* this means that rec_get_offsets() has created a new
+ heap and has stored offsets in it; check that this is
+ really the case and free the heap */
+ ut_a(offsets != offsets_onstack);
+ mem_heap_free(heap);
+ }
+
+ mtr_commit(&mtr);
+
+ if (*lock_data == NULL) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/** @return the table of a lock */
+static const dict_table_t *lock_get_table(const lock_t &lock)
+{
+ if (lock.is_table())
+ return lock.un_member.tab_lock.table;
+ ut_ad(lock.index->is_primary() || !dict_index_is_online_ddl(lock.index));
+ return lock.index->table;
+}
+
+/*******************************************************************//**
+Fills an i_s_locks_row_t object.
+If memory can not be allocated then FALSE is returned.
+@return false if allocation fails */
+static bool fill_locks_row(
+ i_s_locks_row_t* row, /*!< out: result object that's filled */
+ const lock_t* lock, /*!< in: lock to get data from */
+ uint16_t heap_no,/*!< in: lock's record number
+ or 0 if the lock
+ is a table lock */
+ trx_i_s_cache_t* cache) /*!< in/out: cache into which to copy
+ volatile strings */
+{
+ row->lock_trx_id = lock->trx->id;
+ const bool is_gap_lock = lock->is_gap();
+ ut_ad(!is_gap_lock || !lock->is_table());
+ switch (lock->mode()) {
+ case LOCK_S:
+ row->lock_mode = uint8_t(1 + is_gap_lock);
+ break;
+ case LOCK_X:
+ row->lock_mode = uint8_t(3 + is_gap_lock);
+ break;
+ case LOCK_IS:
+ row->lock_mode = uint8_t(5 + is_gap_lock);
+ break;
+ case LOCK_IX:
+ row->lock_mode = uint8_t(7 + is_gap_lock);
+ break;
+ case LOCK_AUTO_INC:
+ row->lock_mode = 9;
+ break;
+ default:
+ ut_ad("unknown lock mode" == 0);
+ row->lock_mode = 0;
+ }
+
+ const dict_table_t* table= lock_get_table(*lock);
+
+ row->lock_table = ha_storage_put_str_memlim(
+ cache->storage, table->name.m_name,
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ /* memory could not be allocated */
+ if (row->lock_table == NULL) {
+
+ return false;
+ }
+
+ if (!lock->is_table()) {
+ row->lock_index = ha_storage_put_str_memlim(
+ cache->storage, lock->index->name,
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ /* memory could not be allocated */
+ if (row->lock_index == NULL) {
+
+ return false;
+ }
+
+ row->lock_page = lock->un_member.rec_lock.page_id;
+ row->lock_rec = heap_no;
+
+ if (!fill_lock_data(&row->lock_data, lock, heap_no, cache)) {
+
+ /* memory could not be allocated */
+ return false;
+ }
+ } else {
+ row->lock_index = NULL;
+
+ row->lock_page = page_id_t(0, 0);
+ row->lock_rec = 0;
+
+ row->lock_data = NULL;
+ }
+
+ row->lock_table_id = table->id;
+
+ row->hash_chain.value = row;
+ ut_ad(i_s_locks_row_validate(row));
+
+ return true;
+}
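
The numeric lock_mode codes assigned in the switch above therefore form a small fixed mapping. A hedged summary, as an illustrative array (the display strings ultimately shown by handler/i_s.cc may be spelled differently):

/* illustrative decoding of i_s_locks_row_t::lock_mode; 0 means unknown */
static const char* const lock_mode_sketch[] = {
  "UNKNOWN",          /* 0 */
  "S",  "S,GAP",      /* 1, 2 */
  "X",  "X,GAP",      /* 3, 4 */
  "IS", "IS,GAP",     /* 5, 6 */
  "IX", "IX,GAP",     /* 7, 8 */
  "AUTO_INC"          /* 9 */
};
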
+
+/*******************************************************************//**
+Fills i_s_lock_waits_row_t object. Returns its first argument.
+@return result object that's filled */
+static
+i_s_lock_waits_row_t*
+fill_lock_waits_row(
+/*================*/
+ i_s_lock_waits_row_t* row, /*!< out: result object
+ that's filled */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ relevant requested lock
+ row in innodb_locks */
+ const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the
+ relevant blocking lock
+ row in innodb_locks */
+{
+ ut_ad(i_s_locks_row_validate(requested_lock_row));
+ ut_ad(i_s_locks_row_validate(blocking_lock_row));
+
+ row->requested_lock_row = requested_lock_row;
+ row->blocking_lock_row = blocking_lock_row;
+
+ return(row);
+}
+
+/*******************************************************************//**
+Calculates a hash fold for a lock. For a record lock the fold is
+calculated from 4 elements, which uniquely identify a lock at a given
+point in time: transaction id, space id, page number, record number.
+For a table lock the fold is table's id.
+@return fold */
+static
+ulint
+fold_lock(
+/*======*/
+ const lock_t* lock, /*!< in: lock object to fold */
+ ulint heap_no)/*!< in: lock's record number
+ or 0xFFFF if the lock
+ is a table lock */
+{
+#ifdef TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+ static ulint fold = 0;
+
+ return(fold++);
+#else
+ ulint ret;
+
+ if (!lock->is_table()) {
+ ut_a(heap_no != 0xFFFF);
+ ret = ut_fold_ulint_pair((ulint) lock->trx->id,
+ lock->un_member.rec_lock.page_id.
+ fold());
+ ret = ut_fold_ulint_pair(ret, heap_no);
+ } else {
+ /* this check is actually not necessary for continuing
+ correct operation, but something must have gone wrong if
+ it fails. */
+ ut_a(heap_no == 0xFFFF);
+
+ ret = (ulint) lock_get_table(*lock)->id;
+ }
+
+ return(ret);
+#endif
+}
+
+/*******************************************************************//**
+Checks whether i_s_locks_row_t object represents a lock_t object.
+@return TRUE if they match */
+static
+ibool
+locks_row_eq_lock(
+/*==============*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ const lock_t* lock, /*!< in: lock object */
+ ulint heap_no)/*!< in: lock's record number
+ or 0xFFFF if the lock
+ is a table lock */
+{
+ ut_ad(i_s_locks_row_validate(row));
+#ifdef TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+ return(0);
+#else
+ if (!lock->is_table()) {
+ ut_a(heap_no != 0xFFFF);
+
+ return(row->lock_trx_id == lock->trx->id
+ && row->lock_page == lock->un_member.rec_lock.page_id
+ && row->lock_rec == heap_no);
+ } else {
+ /* this check is actually not necessary for continuing
+ correct operation, but something must have gone wrong if
+ it fails. */
+ ut_a(heap_no == 0xFFFF);
+
+ return(row->lock_trx_id == lock->trx->id
+ && row->lock_table_id == lock_get_table(*lock)->id);
+ }
+#endif
+}
+
+/*******************************************************************//**
+Searches for a row in the innodb_locks cache that has a specified id.
+This happens in O(1) time since a hash table is used. Returns pointer to
+the row or NULL if none is found.
+@return row or NULL */
+static
+i_s_locks_row_t*
+search_innodb_locks(
+/*================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ const lock_t* lock, /*!< in: lock to search for */
+ uint16_t heap_no)/*!< in: lock's record number
+ or 0xFFFF if the lock
+ is a table lock */
+{
+ i_s_hash_chain_t* hash_chain;
+
+ HASH_SEARCH(
+ /* hash_chain->"next" */
+ next,
+ /* the hash table */
+ &cache->locks_hash,
+ /* fold */
+ fold_lock(lock, heap_no),
+ /* the type of the next variable */
+ i_s_hash_chain_t*,
+ /* auxiliary variable */
+ hash_chain,
+ /* assertion on every traversed item */
+ ut_ad(i_s_locks_row_validate(hash_chain->value)),
+ /* this determines if we have found the lock */
+ locks_row_eq_lock(hash_chain->value, lock, heap_no));
+
+ if (hash_chain == NULL) {
+
+ return(NULL);
+ }
+ /* else */
+
+ return(hash_chain->value);
+}
+
+/*******************************************************************//**
+Adds new element to the locks cache, enlarging it if necessary.
+Returns a pointer to the added row. If the row is already present then
+no row is added and a pointer to the existing row is returned.
+If row can not be allocated then NULL is returned.
+@return row */
+static
+i_s_locks_row_t*
+add_lock_to_cache(
+/*==============*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const lock_t* lock, /*!< in: the element to add */
+ uint16_t heap_no)/*!< in: lock's record number
+ or 0 if the lock
+ is a table lock */
+{
+ i_s_locks_row_t* dst_row;
+
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+ ulint i;
+ for (i = 0; i < 10000; i++) {
+#endif
+#ifndef TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+ /* quit if this lock is already present */
+ dst_row = search_innodb_locks(cache, lock, heap_no);
+ if (dst_row != NULL) {
+
+ ut_ad(i_s_locks_row_validate(dst_row));
+ return(dst_row);
+ }
+#endif
+
+ dst_row = (i_s_locks_row_t*)
+ table_cache_create_empty_row(&cache->innodb_locks, cache);
+
+ /* memory could not be allocated */
+ if (dst_row == NULL) {
+
+ return(NULL);
+ }
+
+ if (!fill_locks_row(dst_row, lock, heap_no, cache)) {
+
+ /* memory could not be allocated */
+ cache->innodb_locks.rows_used--;
+ return(NULL);
+ }
+
+#ifndef TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+ HASH_INSERT(
+ /* the type used in the hash chain */
+ i_s_hash_chain_t,
+ /* hash_chain->"next" */
+ next,
+ /* the hash table */
+ &cache->locks_hash,
+ /* fold */
+ fold_lock(lock, heap_no),
+ /* add this data to the hash */
+ &dst_row->hash_chain);
+#endif
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+ } /* for()-loop */
+#endif
+
+ ut_ad(i_s_locks_row_validate(dst_row));
+ return(dst_row);
+}
+
+/*******************************************************************//**
+Adds new pair of locks to the lock waits cache.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+add_lock_wait_to_cache(
+/*===================*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ relevant requested lock
+ row in innodb_locks */
+ const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the
+ relevant blocking lock
+ row in innodb_locks */
+{
+ i_s_lock_waits_row_t* dst_row;
+
+ dst_row = (i_s_lock_waits_row_t*)
+ table_cache_create_empty_row(&cache->innodb_lock_waits,
+ cache);
+
+ /* memory could not be allocated */
+ if (dst_row == NULL) {
+
+ return(FALSE);
+ }
+
+ fill_lock_waits_row(dst_row, requested_lock_row, blocking_lock_row);
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Adds transaction's relevant (important) locks to cache.
+If the transaction is waiting, then the wait lock is added to
+innodb_locks and a pointer to the added row is returned in
+requested_lock_row, otherwise requested_lock_row is set to NULL.
+If rows can not be allocated then FALSE is returned and the value of
+requested_lock_row is undefined.
+@return FALSE if allocation fails */
+static
+ibool
+add_trx_relevant_locks_to_cache(
+/*============================*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const trx_t* trx, /*!< in: transaction */
+ i_s_locks_row_t** requested_lock_row)/*!< out: pointer to the
+ requested lock row, or NULL or
+ undefined */
+{
+ lock_sys.assert_locked();
+
+ /* If transaction is waiting we add the wait lock and all locks
+ from another transactions that are blocking the wait lock. */
+ if (const lock_t *wait_lock = trx->lock.wait_lock) {
+
+ const lock_t* curr_lock;
+ i_s_locks_row_t* blocking_lock_row;
+ lock_queue_iterator_t iter;
+
+ uint16_t wait_lock_heap_no
+ = wait_lock_get_heap_no(wait_lock);
+
+ /* add the requested lock */
+ *requested_lock_row = add_lock_to_cache(cache, wait_lock,
+ wait_lock_heap_no);
+
+ /* memory could not be allocated */
+ if (*requested_lock_row == NULL) {
+
+ return(FALSE);
+ }
+
+ /* then iterate over the locks before the wait lock and
+ add the ones that are blocking it */
+
+ lock_queue_iterator_reset(&iter, wait_lock, ULINT_UNDEFINED);
+
+ for (curr_lock = lock_queue_iterator_get_prev(&iter);
+ curr_lock != NULL;
+ curr_lock = lock_queue_iterator_get_prev(&iter)) {
+
+ if (lock_has_to_wait(wait_lock, curr_lock)) {
+
+ /* add the lock that is
+ blocking wait_lock */
+ blocking_lock_row
+ = add_lock_to_cache(
+ cache, curr_lock,
+ /* heap_no is the same
+ for the wait and waited
+ locks */
+ wait_lock_heap_no);
+
+ /* memory could not be allocated */
+ if (blocking_lock_row == NULL) {
+
+ return(FALSE);
+ }
+
+ /* add the relation between both locks
+ to innodb_lock_waits */
+ if (!add_lock_wait_to_cache(
+ cache, *requested_lock_row,
+ blocking_lock_row)) {
+
+ /* memory could not be allocated */
+ return(FALSE);
+ }
+ }
+ }
+ } else {
+
+ *requested_lock_row = NULL;
+ }
+
+ return(TRUE);
+}
+
+/** The minimum time that a cache must not be updated after it has been
+read for the last time; measured in nanoseconds. We use this technique
+to ensure that SELECTs which join several INFORMATION SCHEMA tables read
+the same version of the cache. */
+#define CACHE_MIN_IDLE_TIME_NS 100000000 /* 0.1 sec */
+
+/*******************************************************************//**
+Checks if the cache can safely be updated.
+@return whether the cache can be updated */
+static bool can_cache_be_updated(trx_i_s_cache_t* cache)
+{
+ /* cache->last_read is only updated when a shared rw lock on the
+ whole cache is being held (see trx_i_s_cache_end_read()) and
+ we are currently holding an exclusive rw lock on the cache.
+ So it is not possible for last_read to be updated while we are
+ reading it. */
+ return my_interval_timer() - cache->last_read > CACHE_MIN_IDLE_TIME_NS;
+}
+
+/*******************************************************************//**
+Declare a cache empty, preparing it to be filled up. Not all resources
+are freed because they can be reused. */
+static
+void
+trx_i_s_cache_clear(
+/*================*/
+ trx_i_s_cache_t* cache) /*!< out: cache to clear */
+{
+ cache->innodb_trx.rows_used = 0;
+ cache->innodb_locks.rows_used = 0;
+ cache->innodb_lock_waits.rows_used = 0;
+
+ cache->locks_hash.clear();
+
+ ha_storage_empty(&cache->storage);
+}
+
+
+/**
+ Add transactions to innodb_trx's cache.
+
+ We also add all locks that are relevant to each transaction into
+ innodb_locks' and innodb_lock_waits' caches.
+*/
+
+static void fetch_data_into_cache_low(trx_i_s_cache_t *cache, const trx_t *trx)
+{
+ i_s_locks_row_t *requested_lock_row;
+
+#ifdef UNIV_DEBUG
+ {
+ const auto state= trx->state;
+
+ if (trx->is_autocommit_non_locking())
+ {
+ ut_ad(trx->read_only);
+ ut_ad(!trx->is_recovered);
+ ut_ad(trx->mysql_thd);
+ ut_ad(state == TRX_STATE_NOT_STARTED || state == TRX_STATE_ACTIVE);
+ }
+ else
+ ut_ad(state == TRX_STATE_ACTIVE ||
+ state == TRX_STATE_PREPARED ||
+ state == TRX_STATE_PREPARED_RECOVERED ||
+ state == TRX_STATE_COMMITTED_IN_MEMORY);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (add_trx_relevant_locks_to_cache(cache, trx, &requested_lock_row))
+ {
+ if (i_s_trx_row_t *trx_row= reinterpret_cast<i_s_trx_row_t*>(
+ table_cache_create_empty_row(&cache->innodb_trx, cache)))
+ {
+ if (fill_trx_row(trx_row, trx, requested_lock_row, cache))
+ return;
+ --cache->innodb_trx.rows_used;
+ }
+ }
+
+ /* memory could not be allocated */
+ cache->is_truncated= true;
+}
+
+
+/**
+ Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the
+ table cache buffer. Cache must be locked for write.
+*/
+
+static void fetch_data_into_cache(trx_i_s_cache_t *cache)
+{
+ LockMutexGuard g{SRW_LOCK_CALL};
+ trx_i_s_cache_clear(cache);
+
+ /* Capture the state of transactions */
+ trx_sys.trx_list.for_each([cache](trx_t &trx) {
+ if (!cache->is_truncated && trx.state != TRX_STATE_NOT_STARTED &&
+ &trx != (purge_sys.query ? purge_sys.query->trx : nullptr))
+ {
+ trx.mutex_lock();
+ if (trx.state != TRX_STATE_NOT_STARTED)
+ fetch_data_into_cache_low(cache, &trx);
+ trx.mutex_unlock();
+ }
+ });
+ cache->is_truncated= false;
+}
+
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time.
+Called from handler/i_s.cc.
+@return 0 - fetched, 1 - not */
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+ trx_i_s_cache_t* cache) /*!< in/out: cache */
+{
+ if (!can_cache_be_updated(cache)) {
+
+ return(1);
+ }
+
+ /* We need to read trx_sys and record/table lock queues */
+ fetch_data_into_cache(cache);
+
+ /* update cache last read time */
+ cache->last_read = my_interval_timer();
+
+ return(0);
+}
+
+/*******************************************************************//**
+Returns TRUE if the data in the cache is truncated due to the memory
+limit posed by TRX_I_S_MEM_LIMIT.
+@return TRUE if truncated */
+bool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ return(cache->is_truncated);
+}
+
+/*******************************************************************//**
+Initialize INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_init(
+/*===============*/
+ trx_i_s_cache_t* cache) /*!< out: cache to init */
+{
+ /* The latching is done in the following order:
+ acquire trx_i_s_cache_t::rw_lock, rwlock
+ acquire exclusive lock_sys.latch
+ release exclusive lock_sys.latch
+ release trx_i_s_cache_t::rw_lock
+ acquire trx_i_s_cache_t::rw_lock, rdlock
+ release trx_i_s_cache_t::rw_lock */
+
+ cache->rw_lock.SRW_LOCK_INIT(trx_i_s_cache_lock_key);
+
+ cache->last_read = 0;
+
+ table_cache_init(&cache->innodb_trx, sizeof(i_s_trx_row_t));
+ table_cache_init(&cache->innodb_locks, sizeof(i_s_locks_row_t));
+ table_cache_init(&cache->innodb_lock_waits,
+ sizeof(i_s_lock_waits_row_t));
+
+ cache->locks_hash.create(LOCKS_HASH_CELLS_NUM);
+
+ cache->storage = ha_storage_create(CACHE_STORAGE_INITIAL_SIZE,
+ CACHE_STORAGE_HASH_CELLS);
+
+ cache->mem_allocd = 0;
+
+ cache->is_truncated = false;
+}
+
+/*******************************************************************//**
+Free the INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_free(
+/*===============*/
+ trx_i_s_cache_t* cache) /*!< in, own: cache to free */
+{
+ cache->rw_lock.destroy();
+
+ cache->locks_hash.free();
+ ha_storage_free(cache->storage);
+ table_cache_free(&cache->innodb_trx);
+ table_cache_free(&cache->innodb_locks);
+ table_cache_free(&cache->innodb_lock_waits);
+}
+
+/*******************************************************************//**
+Issue a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ cache->rw_lock.rd_lock(SRW_LOCK_CALL);
+}
+
+/*******************************************************************//**
+Release a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_end_read(
+/*===================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ cache->last_read = my_interval_timer();
+ cache->rw_lock.rd_unlock();
+}
+
+/*******************************************************************//**
+Issue an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_start_write(
+/*======================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ cache->rw_lock.wr_lock(SRW_LOCK_CALL);
+}
+
+/*******************************************************************//**
+Release an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_end_write(
+/*====================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ cache->rw_lock.wr_unlock();
+}
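
Taken together with the latching order documented in trx_i_s_cache_init(), a reader of the cache is expected to refresh it under the exclusive latch and then fetch rows under the shared latch. A rough usage sketch follows; the helper name is illustrative, and the real consumer (handler/i_s.cc) may differ in detail:

static void read_innodb_trx_sketch(trx_i_s_cache_t* cache)
{
  /* refresh the cache; a no-op if it was read less than 0.1 s ago */
  trx_i_s_cache_start_write(cache);
  trx_i_s_possibly_fetch_data_into_cache(cache);
  trx_i_s_cache_end_write(cache);

  /* read a consistent snapshot of the innodb_trx rows */
  trx_i_s_cache_start_read(cache);
  for (ulint i = 0; i < trx_i_s_cache_get_rows_used(cache, I_S_INNODB_TRX);
       i++) {
    const i_s_trx_row_t* row = static_cast<const i_s_trx_row_t*>(
      trx_i_s_cache_get_nth_row(cache, I_S_INNODB_TRX, i));
    (void) row;   /* ...copy the fields into the SQL result set here... */
  }
  trx_i_s_cache_end_read(cache);
}
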
+
+/*******************************************************************//**
+Selects an INFORMATION SCHEMA table cache from the whole cache.
+@return table cache */
+static
+i_s_table_cache_t*
+cache_select_table(
+/*===============*/
+ trx_i_s_cache_t* cache, /*!< in: whole cache */
+ enum i_s_table table) /*!< in: which table */
+{
+ switch (table) {
+ case I_S_INNODB_TRX:
+ return &cache->innodb_trx;
+ case I_S_INNODB_LOCKS:
+ return &cache->innodb_locks;
+ case I_S_INNODB_LOCK_WAITS:
+ return &cache->innodb_lock_waits;
+ }
+
+ ut_error;
+ return NULL;
+}
+
+/*******************************************************************//**
+Retrieves the number of used rows in the cache for a given
+INFORMATION SCHEMA table.
+@return number of rows */
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table) /*!< in: which table */
+{
+ i_s_table_cache_t* table_cache;
+
+ table_cache = cache_select_table(cache, table);
+
+ return(table_cache->rows_used);
+}
+
+/*******************************************************************//**
+Retrieves the nth row (zero-based) in the cache for a given
+INFORMATION SCHEMA table.
+@return row */
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table, /*!< in: which table */
+ ulint n) /*!< in: row number */
+{
+ i_s_table_cache_t* table_cache;
+ ulint i;
+ void* row;
+
+ table_cache = cache_select_table(cache, table);
+
+ ut_a(n < table_cache->rows_used);
+
+ row = NULL;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].offset
+ + table_cache->chunks[i].rows_allocd > n) {
+
+ row = (char*) table_cache->chunks[i].base
+ + (n - table_cache->chunks[i].offset)
+ * table_cache->row_size;
+ break;
+ }
+ }
+
+ ut_a(row != NULL);
+
+ return(row);
+}
+
+/*******************************************************************//**
+Crafts a lock id string from an i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you
+want to be 100% sure that it will not abort.
+@return resulting lock id */
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ char* lock_id,/*!< out: resulting lock_id */
+ ulint lock_id_size)/*!< in: size of the lock id
+ buffer */
+{
+ int res_len;
+
+ /* please adjust TRX_I_S_LOCK_ID_MAX_LEN if you change this */
+
+ if (row->lock_index) {
+ /* record lock */
+ res_len = snprintf(lock_id, lock_id_size,
+ TRX_ID_FMT
+ ":%u:%u:%u",
+ row->lock_trx_id, row->lock_page.space(),
+ row->lock_page.page_no(), row->lock_rec);
+ } else {
+ /* table lock */
+ res_len = snprintf(lock_id, lock_id_size,
+ TRX_ID_FMT":" UINT64PF,
+ row->lock_trx_id,
+ row->lock_table_id);
+ }
+
+ /* the typecast is safe because snprintf(3) never returns
+ negative result */
+ ut_a(res_len >= 0);
+ ut_a((ulint) res_len < lock_id_size);
+
+ return(lock_id);
+}
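
Concretely, the two shapes built above are "<trx id>:<space>:<page>:<heap no>" for a record lock and "<trx id>:<table id>" for a table lock. A minimal caller sketch sized per the advice in the comment (the helper name and the example values in the comments are made up):

static void lock_id_sketch(const i_s_locks_row_t* rec_row,
                           const i_s_locks_row_t* tab_row)
{
  char id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
  /* record lock, e.g. "1234:5:6:7" (trx 1234, space 5, page 6, heap no 7) */
  trx_i_s_create_lock_id(rec_row, id, sizeof id);
  /* table lock, e.g. "1234:42" (trx 1234, table id 42) */
  trx_i_s_create_lock_id(tab_row, id, sizeof id);
}
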
diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc
new file mode 100644
index 00000000..625d3223
--- /dev/null
+++ b/storage/innobase/trx/trx0purge.cc
@@ -0,0 +1,1416 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0purge.cc
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0purge.h"
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "que0que.h"
+#include "row0purge.h"
+#include "row0upd.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0rec.h"
+#include "trx0roll.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include <mysql/service_wsrep.h>
+
+#include <unordered_map>
+
+/** Maximum allowable purge history length. <=0 means 'infinite'. */
+ulong srv_max_purge_lag = 0;
+
+/** Max DML user threads delay in micro-seconds. */
+ulong srv_max_purge_lag_delay = 0;
+
+/** The global data structure coordinating a purge */
+purge_sys_t purge_sys;
+
+#ifdef UNIV_DEBUG
+my_bool srv_purge_view_update_only_debug;
+#endif /* UNIV_DEBUG */
+
+/** Sentinel value */
+static const TrxUndoRsegs NullElement;
+
+/** Default constructor */
+TrxUndoRsegsIterator::TrxUndoRsegsIterator()
+ : m_rsegs(NullElement), m_iter(m_rsegs.begin())
+{
+}
+
+/** Sets the next rseg to purge in purge_sys.
+Executed in the purge coordinator thread.
+@return whether anything is to be purged */
+TRANSACTIONAL_INLINE inline bool TrxUndoRsegsIterator::set_next()
+{
+ mysql_mutex_lock(&purge_sys.pq_mutex);
+
+ /* Only purge consumes events from the priority queue, user
+ threads only produce the events. */
+
+ /* Check if there are more rsegs to process in the
+ current element. */
+ if (m_iter != m_rsegs.end()) {
+ /* We are still processing rollback segment from
+ the same transaction and so expected transaction
+ number shouldn't increase. Undo the increment of
+ expected commit done by caller assuming rollback
+ segments from given transaction are done. */
+ purge_sys.tail.trx_no = (*m_iter)->last_trx_no();
+ } else if (!purge_sys.purge_queue.empty()) {
+ m_rsegs = purge_sys.purge_queue.top();
+ purge_sys.purge_queue.pop();
+ ut_ad(purge_sys.purge_queue.empty()
+ || purge_sys.purge_queue.top() != m_rsegs);
+ m_iter = m_rsegs.begin();
+ } else {
+ /* Queue is empty, reset iterator. */
+ purge_sys.rseg = NULL;
+ mysql_mutex_unlock(&purge_sys.pq_mutex);
+ m_rsegs = NullElement;
+ m_iter = m_rsegs.begin();
+ return false;
+ }
+
+ purge_sys.rseg = *m_iter++;
+ mysql_mutex_unlock(&purge_sys.pq_mutex);
+
+ /* We assume in purge of externally stored fields that space
+ id is in the range of UNDO tablespace space ids */
+ ut_ad(purge_sys.rseg->space->id == TRX_SYS_SPACE
+ || srv_is_undo_tablespace(purge_sys.rseg->space->id));
+
+ trx_id_t last_trx_no;
+ {
+#ifdef SUX_LOCK_GENERIC
+ purge_sys.rseg->latch.rd_lock(SRW_LOCK_CALL);
+#else
+ transactional_shared_lock_guard<srw_spin_lock> rg
+ {purge_sys.rseg->latch};
+#endif
+ last_trx_no = purge_sys.rseg->last_trx_no();
+
+ purge_sys.hdr_offset = purge_sys.rseg->last_offset();
+ purge_sys.hdr_page_no = purge_sys.rseg->last_page_no;
+
+#ifdef SUX_LOCK_GENERIC
+ purge_sys.rseg->latch.rd_unlock();
+#endif
+ }
+
+ /* Only the purge coordinator task will access this object
+ purge_sys.rseg_iter, or any of purge_sys.hdr_page_no,
+ purge_sys.tail, purge_sys.head, or modify purge_sys.view. */
+ ut_ad(last_trx_no == m_rsegs.trx_no);
+ ut_a(purge_sys.hdr_page_no != FIL_NULL);
+ ut_a(purge_sys.tail.trx_no <= last_trx_no);
+ purge_sys.tail.trx_no = last_trx_no;
+
+ return(true);
+}
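
The effect of the priority queue is that the purge coordinator consumes batches of rollback segments in increasing commit number (trx_no) order, one batch per committed transaction. A simplified sketch with standalone types (not the actual purge_queue declaration; all names here are illustrative):

#include <queue>
#include <vector>

struct rseg_batch { unsigned long long trx_no; std::vector<int> rseg_ids; };
struct oldest_first {
  bool operator()(const rseg_batch& a, const rseg_batch& b) const
  { return a.trx_no > b.trx_no; }            /* smallest trx_no on top */
};

static void drain_sketch(std::priority_queue<rseg_batch,
                                             std::vector<rseg_batch>,
                                             oldest_first>& q)
{
  while (!q.empty()) {
    rseg_batch b = q.top();                  /* oldest unprocessed commit */
    q.pop();
    for (int rseg : b.rseg_ids) {
      (void) rseg;  /* purge this rollback segment's history up to b.trx_no */
    }
  }
}
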
+
+/** Build a purge 'query' graph. The actual purge is performed by executing
+this query graph.
+@return own: the query graph */
+static
+que_t*
+purge_graph_build()
+{
+ ut_a(srv_n_purge_threads > 0);
+
+ trx_t* trx = trx_create();
+ ut_ad(!trx->id);
+ trx->start_time = time(NULL);
+ trx->start_time_micro = microsecond_interval_timer();
+ trx->state = TRX_STATE_ACTIVE;
+ trx->op_info = "purge trx";
+
+ mem_heap_t* heap = mem_heap_create(512);
+ que_fork_t* fork = que_fork_create(heap);
+ fork->trx = trx;
+
+ for (auto i = innodb_purge_threads_MAX; i; i--) {
+ que_thr_t* thr = que_thr_create(fork, heap, NULL);
+ thr->child = new(mem_heap_alloc(heap, sizeof(purge_node_t)))
+ purge_node_t(thr);
+ }
+
+ return(fork);
+}
+
+/** Initialise the purge system. */
+void purge_sys_t::create()
+{
+ ut_ad(this == &purge_sys);
+ ut_ad(!heap);
+ ut_ad(!enabled());
+ m_paused= 0;
+ m_SYS_paused= 0;
+ query= purge_graph_build();
+ next_stored= false;
+ rseg= NULL;
+ page_no= 0;
+ offset= 0;
+ hdr_page_no= 0;
+ hdr_offset= 0;
+ latch.SRW_LOCK_INIT(trx_purge_latch_key);
+ end_latch.init();
+ mysql_mutex_init(purge_sys_pq_mutex_key, &pq_mutex, nullptr);
+ truncate.current= NULL;
+ truncate.last= NULL;
+ heap= mem_heap_create(4096);
+}
+
+/** Close the purge subsystem on shutdown. */
+void purge_sys_t::close()
+{
+ ut_ad(this == &purge_sys);
+ if (!heap)
+ return;
+
+ ut_ad(!enabled());
+ trx_t* trx = query->trx;
+ que_graph_free(query);
+ ut_ad(!trx->id);
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ trx->state= TRX_STATE_NOT_STARTED;
+ trx->free();
+ latch.destroy();
+ end_latch.destroy();
+ mysql_mutex_destroy(&pq_mutex);
+ mem_heap_free(heap);
+ heap= nullptr;
+}
+
+/** Determine if the history of a transaction is purgeable.
+@param trx_id transaction identifier
+@return whether the history is purgeable */
+TRANSACTIONAL_TARGET bool purge_sys_t::is_purgeable(trx_id_t trx_id) const
+{
+ bool purgeable;
+#if !defined SUX_LOCK_GENERIC && !defined NO_ELISION
+ purgeable= false;
+ if (xbegin())
+ {
+ if (!latch.is_write_locked())
+ {
+ purgeable= view.changes_visible(trx_id);
+ xend();
+ }
+ else
+ xabort();
+ }
+ else
+#endif
+ {
+ latch.rd_lock(SRW_LOCK_CALL);
+ purgeable= view.changes_visible(trx_id);
+ latch.rd_unlock();
+ }
+ return purgeable;
+}
+
+/*================ UNDO LOG HISTORY LIST =============================*/
+
+/** Prepend the history list with an undo log.
+Remove the undo log segment from the rseg slot if it is too big for reuse.
+@param[in] trx transaction
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction */
+void
+trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
+{
+ DBUG_PRINT("trx", ("commit(" TRX_ID_FMT "," TRX_ID_FMT ")",
+ trx->id, trx_id_t{trx->rw_trx_hash_element->no}));
+ ut_ad(undo == trx->rsegs.m_redo.undo);
+ trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
+ ut_ad(undo->rseg == rseg);
+ buf_block_t* rseg_header = rseg->get(mtr, nullptr);
+ /* We are in transaction commit; we cannot return an error. If the
+ database is corrupted, it is better to crash it than to
+ intentionally violate ACID by committing something that is known to
+ be corrupted. */
+ ut_ad(rseg_header);
+ buf_block_t* undo_page = trx_undo_set_state_at_finish(
+ undo, mtr);
+ trx_ulogf_t* undo_header = undo_page->page.frame
+ + undo->hdr_offset;
+
+ ut_ad(mach_read_from_2(undo_header + TRX_UNDO_NEEDS_PURGE) <= 1);
+
+ if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+ + rseg_header->page.frame))) {
+ /* This database must have been upgraded from
+ before MariaDB 10.3.5. */
+ trx_rseg_format_upgrade(rseg_header, mtr);
+ }
+
+ if (undo->state != TRX_UNDO_CACHED) {
+ /* The undo log segment will not be reused */
+ ut_a(undo->id < TRX_RSEG_N_SLOTS);
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ mtr->memset(rseg_header,
+ TRX_RSEG + TRX_RSEG_UNDO_SLOTS
+ + undo->id * TRX_RSEG_SLOT_SIZE, 4, 0xff);
+
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
+
+ uint32_t hist_size = mach_read_from_4(
+ TRX_RSEG_HISTORY_SIZE + TRX_RSEG
+ + rseg_header->page.frame);
+
+ ut_ad(undo->size == flst_get_len(TRX_UNDO_SEG_HDR
+ + TRX_UNDO_PAGE_LIST
+ + undo_page->page.frame));
+
+ mtr->write<4>(*rseg_header, TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+ + rseg_header->page.frame,
+ hist_size + undo->size);
+ mtr->write<8>(*rseg_header, TRX_RSEG + TRX_RSEG_MAX_TRX_ID
+ + rseg_header->page.frame,
+ trx_sys.get_max_trx_id());
+ }
+
+ /* After the purge thread has been given permission to exit,
+ we may roll back transactions (trx->undo_no==0)
+ in THD::cleanup() invoked from unlink_thd() in fast shutdown,
+ or in trx_rollback_recovered() in slow shutdown.
+
+ Before any transaction-generating background threads or the
+ purge have been started, we can
+ start transactions in row_merge_drop_temp_indexes(),
+ and roll back recovered transactions.
+
+ Arbitrary user transactions may be executed when all the undo log
+ related background processes (including purge) are disabled due to
+ innodb_force_recovery=2 or innodb_force_recovery=3.
+ DROP TABLE may be executed at any innodb_force_recovery level.
+
+ During fast shutdown, we may also continue to execute
+ user transactions. */
+ ut_ad(srv_undo_sources
+ || trx->undo_no == 0
+ || (!purge_sys.enabled()
+ && (srv_is_being_started
+ || trx_rollback_is_active
+ || srv_force_recovery >= SRV_FORCE_NO_BACKGROUND))
+ || srv_fast_shutdown);
+
+#ifdef WITH_WSREP
+ if (wsrep_is_wsrep_xid(&trx->xid)) {
+ trx_rseg_update_wsrep_checkpoint(rseg_header, &trx->xid, mtr);
+ }
+#endif
+
+ if (trx->mysql_log_file_name && *trx->mysql_log_file_name) {
+ /* Update the latest MySQL binlog name and offset info
+ in rollback segment header if MySQL binlogging is on
+ or the database server is a MySQL replication slave. */
+ trx_rseg_update_binlog_offset(rseg_header, trx, mtr);
+ }
+
+ /* Add the log as the first in the history list */
+
+ /* We are in transaction commit; we cannot return an error
+ when detecting corruption. It is better to crash the server
+ than to intentionally violate ACID by committing something
+ that is known to be corrupted. */
+ ut_a(flst_add_first(rseg_header, TRX_RSEG + TRX_RSEG_HISTORY, undo_page,
+ static_cast<uint16_t>(undo->hdr_offset
+ + TRX_UNDO_HISTORY_NODE),
+ mtr) == DB_SUCCESS);
+
+ mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page,
+ undo_header + TRX_UNDO_TRX_NO,
+ trx->rw_trx_hash_element->no);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_header
+ + TRX_UNDO_NEEDS_PURGE, 1U);
+
+ if (rseg->last_page_no == FIL_NULL) {
+ rseg->last_page_no = undo->hdr_page_no;
+ rseg->set_last_commit(undo->hdr_offset,
+ trx->rw_trx_hash_element->no);
+ rseg->set_needs_purge();
+ }
+
+ rseg->history_size++;
+
+ if (undo->state == TRX_UNDO_CACHED) {
+ UT_LIST_ADD_FIRST(rseg->undo_cached, undo);
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ } else {
+ ut_ad(undo->state == TRX_UNDO_TO_PURGE);
+ ut_free(undo);
+ }
+
+ undo = NULL;
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Remove undo log header from the history list.
+@param[in,out] rseg rollback segment header page
+@param[in] log undo log segment header page
+@param[in] offset byte offset in the undo log segment header page
+@param[in,out] mtr mini-transaction */
+static dberr_t trx_purge_remove_log_hdr(buf_block_t *rseg, buf_block_t* log,
+ uint16_t offset, mtr_t *mtr)
+{
+ return flst_remove(rseg, TRX_RSEG + TRX_RSEG_HISTORY, log,
+ uint16_t(offset + TRX_UNDO_HISTORY_NODE), mtr);
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Free an undo log segment, and remove the header from the history list.
+@param[in,out] rseg rollback segment
+@param[in] hdr_addr file address of log_hdr
+@return error code */
+static dberr_t trx_purge_free_segment(trx_rseg_t *rseg, fil_addr_t hdr_addr)
+{
+ const page_id_t hdr_page_id{rseg->space->id, hdr_addr.page};
+ mtr_t mtr;
+ mtr.start();
+
+ /* We only need the latch to maintain rseg->curr_size. To follow the
+ latching order, we must acquire it before acquiring any related
+ page latch. */
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
+
+ dberr_t err;
+ buf_block_t *rseg_hdr= rseg->get(&mtr, &err);
+ if (!rseg_hdr)
+ goto func_exit;
+ if (buf_block_t *block= buf_page_get_gen(hdr_page_id, 0, RW_X_LATCH,
+ nullptr, BUF_GET_POSSIBLY_FREED,
+ &mtr, &err))
+ {
+ /* Mark the last undo log totally purged, so that if the system
+ crashes, the tail of the undo log will not get accessed again. The
+ list of pages in the undo log tail gets inconsistent during the
+ freeing of the segment, and therefore purge should not try to
+ access them again. */
+ mtr.write<2,mtr_t::MAYBE_NOP>(*block, block->page.frame +
+ hdr_addr.boffset + TRX_UNDO_NEEDS_PURGE, 0U);
+ while (!fseg_free_step_not_header(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER +
+ block->page.frame, &mtr))
+ {
+ rseg->latch.wr_unlock();
+ rseg_hdr->fix();
+ block->fix();
+ mtr.commit();
+ mtr.start();
+ mtr.flag_modified();
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
+ rseg_hdr->page.lock.x_lock();
+ block->page.lock.x_lock();
+ mtr.memo_push(rseg_hdr, MTR_MEMO_PAGE_X_FIX);
+ mtr.memo_push(block, MTR_MEMO_PAGE_X_MODIFY);
+ }
+
+ /* The page list may now be inconsistent, but the length field
+ stored in the list base node tells us how big it was before we
+ started the freeing. */
+ const uint32_t seg_size=
+ flst_get_len(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->page.frame);
+
+ /* We may free the undo log segment header page; it must be freed
+ within the same mtr as the undo log header is removed from the
+ history list: otherwise, in case of a database crash, the segment
+ could become inaccessible garbage in the file space. */
+ err= trx_purge_remove_log_hdr(rseg_hdr, block, hdr_addr.boffset, &mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ goto func_exit;
+ byte *hist= TRX_RSEG + TRX_RSEG_HISTORY_SIZE + rseg_hdr->page.frame;
+ if (UNIV_UNLIKELY(mach_read_from_4(hist) < seg_size))
+ {
+ err= DB_CORRUPTION;
+ goto func_exit;
+ }
+ mtr.write<4>(*rseg_hdr, hist, mach_read_from_4(hist) - seg_size);
+
+ /* Here we assume that a file segment with just the header page
+ can be freed in a few steps, so that the buffer pool is not
+ flooded with bufferfixed pages: see the note in fsp0fsp.cc. */
+ while (!fseg_free_step(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER +
+ block->page.frame, &mtr));
+
+ ut_ad(rseg->curr_size >= seg_size);
+
+ rseg->history_size--;
+ rseg->curr_size -= seg_size;
+ }
+
+func_exit:
+ rseg->latch.wr_unlock();
+ mtr.commit();
+ return err;
+}
+
+/** Remove unnecessary history data from a rollback segment.
+@param[in,out] rseg rollback segment
+@param[in] limit truncate anything before this
+@return error code */
+static
+dberr_t
+trx_purge_truncate_rseg_history(
+ trx_rseg_t& rseg,
+ const purge_sys_t::iterator& limit)
+{
+ fil_addr_t hdr_addr;
+ mtr_t mtr;
+
+ mtr.start();
+ ut_ad(rseg.is_persistent());
+ rseg.latch.wr_lock(SRW_LOCK_CALL);
+
+ dberr_t err;
+ buf_block_t* rseg_hdr = rseg.get(&mtr, &err);
+ if (!rseg_hdr) {
+ goto func_exit;
+ }
+
+ hdr_addr = flst_get_last(TRX_RSEG + TRX_RSEG_HISTORY
+ + rseg_hdr->page.frame);
+ hdr_addr.boffset = static_cast<uint16_t>(hdr_addr.boffset
+ - TRX_UNDO_HISTORY_NODE);
+
+loop:
+ if (hdr_addr.page == FIL_NULL) {
+func_exit:
+ rseg.latch.wr_unlock();
+ mtr.commit();
+ return err;
+ }
+
+ buf_block_t* block = buf_page_get_gen(page_id_t(rseg.space->id,
+ hdr_addr.page),
+ 0, RW_X_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED,
+ &mtr, &err);
+ if (!block) {
+ goto func_exit;
+ }
+
+ const trx_id_t undo_trx_no = mach_read_from_8(
+ block->page.frame + hdr_addr.boffset + TRX_UNDO_TRX_NO);
+
+ if (undo_trx_no >= limit.trx_no) {
+ if (undo_trx_no == limit.trx_no) {
+ err = trx_undo_truncate_start(
+ &rseg, hdr_addr.page,
+ hdr_addr.boffset, limit.undo_no);
+ }
+
+ goto func_exit;
+ }
+
+ fil_addr_t prev_hdr_addr = flst_get_prev_addr(
+ block->page.frame + hdr_addr.boffset + TRX_UNDO_HISTORY_NODE);
+ prev_hdr_addr.boffset = static_cast<uint16_t>(prev_hdr_addr.boffset
+ - TRX_UNDO_HISTORY_NODE);
+
+ if (mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+ + block->page.frame)
+ == TRX_UNDO_TO_PURGE
+ && !mach_read_from_2(block->page.frame + hdr_addr.boffset
+ + TRX_UNDO_NEXT_LOG)) {
+
+ /* We can free the whole log segment */
+
+ rseg.latch.wr_unlock();
+ mtr.commit();
+
+		/* The log header will be removed from the history list
+		by trx_purge_remove_log_hdr(), which is called inside
+		trx_purge_free_segment(). */
+ err = trx_purge_free_segment(&rseg, hdr_addr);
+ if (err != DB_SUCCESS) {
+ return err;
+ }
+ } else {
+ /* Remove the log hdr from the rseg history. */
+ err = trx_purge_remove_log_hdr(rseg_hdr, block,
+ hdr_addr.boffset, &mtr);
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ rseg.history_size--;
+ rseg.latch.wr_unlock();
+ mtr.commit();
+ }
+
+ mtr.start();
+ rseg.latch.wr_lock(SRW_LOCK_CALL);
+
+ hdr_addr = prev_hdr_addr;
+
+ rseg_hdr = rseg.get(&mtr, &err);
+ if (!rseg_hdr) {
+ goto func_exit;
+ }
+
+ goto loop;
+}
+
+/** Remove from the purge queue any rollback segments that reside in the
+undo tablespace marked for truncation.
+@param[in] space undo tablespace being truncated */
+static void trx_purge_cleanse_purge_queue(const fil_space_t& space)
+{
+ typedef std::vector<TrxUndoRsegs> purge_elem_list_t;
+ purge_elem_list_t purge_elem_list;
+
+ mysql_mutex_lock(&purge_sys.pq_mutex);
+
+	/* Remove the rseg instances that are in the purge queue before we
+	start truncating the corresponding undo tablespace. */
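+	/* Note: purge_queue is a priority queue, which does not support
+	removal of arbitrary elements; hence we drain it into a vector,
+	filter it, and re-insert the remaining elements. */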
+ while (!purge_sys.purge_queue.empty()) {
+ purge_elem_list.push_back(purge_sys.purge_queue.top());
+ purge_sys.purge_queue.pop();
+ }
+
+ for (purge_elem_list_t::iterator it = purge_elem_list.begin();
+ it != purge_elem_list.end();
+ ++it) {
+
+ for (TrxUndoRsegs::iterator it2 = it->begin();
+ it2 != it->end();
+ ++it2) {
+ if ((*it2)->space == &space) {
+ it->erase(it2);
+ break;
+ }
+ }
+
+ if (!it->empty()) {
+ purge_sys.purge_queue.push(*it);
+ }
+ }
+
+ mysql_mutex_unlock(&purge_sys.pq_mutex);
+}
+
+#if defined __GNUC__ && __GNUC__ == 4 && !defined __clang__
+# if defined __arm__ || defined __aarch64__
+/* Work around an internal compiler error in GCC 4.8.5 */
+__attribute__((optimize(0)))
+# endif
+#endif
+/**
+Removes unnecessary history data from rollback segments. NOTE that when this
+function is called, the caller must not have any latches on undo log pages!
+*/
+TRANSACTIONAL_TARGET static void trx_purge_truncate_history()
+{
+ ut_ad(purge_sys.head <= purge_sys.tail);
+ purge_sys_t::iterator &head= purge_sys.head.trx_no
+ ? purge_sys.head : purge_sys.tail;
+
+ if (head.trx_no >= purge_sys.low_limit_no())
+ {
+ /* This is sometimes necessary. TODO: find out why. */
+ head.trx_no= purge_sys.low_limit_no();
+ head.undo_no= 0;
+ }
+
+ dberr_t err= DB_SUCCESS;
+ for (auto &rseg : trx_sys.rseg_array)
+ if (rseg.space)
+ if (dberr_t e= trx_purge_truncate_rseg_history(rseg, head))
+ err= e;
+
+ if (err != DB_SUCCESS || srv_undo_tablespaces_active < 2)
+ return;
+
+ while (srv_undo_log_truncate)
+ {
+ if (!purge_sys.truncate.current)
+ {
+ const ulint threshold=
+ ulint(srv_max_undo_log_size >> srv_page_size_shift);
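+      /* Scan the undo tablespaces in round-robin order, starting from
+      the most recently truncated one, and pick the first whose size
+      exceeds the threshold. */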
+ for (ulint i= purge_sys.truncate.last
+ ? purge_sys.truncate.last->id - srv_undo_space_id_start : 0,
+ j= i;; )
+ {
+ const auto space_id= srv_undo_space_id_start + i;
+ ut_ad(srv_is_undo_tablespace(space_id));
+ fil_space_t *space= fil_space_get(space_id);
+ ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+
+ if (space && space->get_size() > threshold)
+ {
+ purge_sys.truncate.current= space;
+ break;
+ }
+
+ ++i;
+ i %= srv_undo_tablespaces_active;
+ if (i == j)
+ return;
+ }
+ }
+
+ fil_space_t &space= *purge_sys.truncate.current;
+    /* An undo tablespace always consists of a single file. */
+ fil_node_t *file= UT_LIST_GET_FIRST(space.chain);
+ /* The undo tablespace files are never closed. */
+ ut_ad(file->is_open());
+
+ DBUG_LOG("undo", "marking for truncate: " << file->name);
+
+ for (auto &rseg : trx_sys.rseg_array)
+ if (rseg.space == &space)
+ /* Once set, this rseg will not be allocated to subsequent
+ transactions, but we will wait for existing active
+ transactions to finish. */
+ rseg.set_skip_allocation();
+
+ for (auto &rseg : trx_sys.rseg_array)
+ {
+ if (rseg.space != &space)
+ continue;
+#ifdef SUX_LOCK_GENERIC
+ rseg.latch.rd_lock(SRW_LOCK_CALL);
+#else
+ transactional_shared_lock_guard<srw_spin_lock> g{rseg.latch};
+#endif
+ ut_ad(rseg.skip_allocation());
+ if (rseg.is_referenced())
+ {
+not_free:
+#ifdef SUX_LOCK_GENERIC
+ rseg.latch.rd_unlock();
+#endif
+ return;
+ }
+
+ if (rseg.curr_size != 1)
+ {
+ /* Check if all segments are cached and safe to remove. */
+ ulint cached= 0;
+ for (trx_undo_t *undo= UT_LIST_GET_FIRST(rseg.undo_cached); undo;
+ undo= UT_LIST_GET_NEXT(undo_list, undo))
+ {
+ if (head.trx_no < undo->trx_id)
+ goto not_free;
+ else
+ cached+= undo->size;
+ }
+
+ ut_ad(rseg.curr_size > cached);
+
+ if (rseg.curr_size > cached + 1)
+ goto not_free;
+ }
+
+#ifdef SUX_LOCK_GENERIC
+ rseg.latch.rd_unlock();
+#endif
+ }
+
+ ib::info() << "Truncating " << file->name;
+ trx_purge_cleanse_purge_queue(space);
+
+ log_free_check();
+
+ mtr_t mtr;
+ mtr.start();
+ mtr.x_lock_space(&space);
+
+ /* Lock all modified pages of the tablespace.
+
+ During truncation, we do not want any writes to the file.
+
+ If a log checkpoint was completed at LSN earlier than our
+ mini-transaction commit and the server was killed, then
+ discarding the to-be-trimmed pages without flushing would
+ break crash recovery. */
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
+ {
+ ut_ad(bpage->oldest_modification());
+ ut_ad(bpage->in_file());
+
+ buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+
+ if (bpage->id().space() == space.id &&
+ bpage->oldest_modification() != 1)
+ {
+ ut_ad(bpage->frame);
+ auto block= reinterpret_cast<buf_block_t*>(bpage);
+ if (!bpage->lock.x_lock_try())
+ {
+ /* Let buf_pool_t::release_freed_page() proceed. */
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ std::this_thread::yield();
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ rescan:
+ bpage= UT_LIST_GET_LAST(buf_pool.flush_list);
+ continue;
+ }
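+        /* Register prev as the flush-list hazard pointer. If another
+        thread removes that block from the flush list while we are not
+        holding flush_list_mutex, flush_hp will be adjusted and the
+        comparison below will force a rescan. */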
+ buf_pool.flush_hp.set(prev);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!block->index); /* There is no AHI on undo tablespaces. */
+#endif
+ bpage->fix();
+ ut_ad(!bpage->is_io_fixed());
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ if (bpage->oldest_modification() > 1)
+ {
+ bpage->reset_oldest_modification();
+ mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
+ }
+ else
+ {
+ bpage->unfix();
+ bpage->lock.x_unlock();
+ }
+
+ if (prev != buf_pool.flush_hp.get())
+ /* Rescan, because we may have lost the position. */
+ goto rescan;
+ }
+
+ bpage= prev;
+ }
+
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ /* Re-initialize tablespace, in a single mini-transaction. */
+ const ulint size= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
+
+ /* Adjust the tablespace metadata. */
+ mysql_mutex_lock(&fil_system.mutex);
+ space.set_stopping();
+ space.is_being_truncated= true;
+ if (space.crypt_data)
+ {
+ space.reacquire();
+ mysql_mutex_unlock(&fil_system.mutex);
+ fil_space_crypt_close_tablespace(&space);
+ space.release();
+ }
+ else
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ for (auto i= 6000; space.referenced();
+ std::this_thread::sleep_for(std::chrono::milliseconds(10)))
+ {
+ if (!--i)
+ {
+ mtr.commit();
+ ib::error() << "Failed to freeze UNDO tablespace " << file->name;
+ return;
+ }
+ }
+
+ /* Associate the undo tablespace with mtr.
+ During mtr::commit_shrink(), InnoDB can use the undo
+ tablespace object to clear all freed ranges */
+ mtr.set_named_space(&space);
+ mtr.trim_pages(page_id_t(space.id, size));
+ ut_a(fsp_header_init(&space, size, &mtr) == DB_SUCCESS);
+ mysql_mutex_lock(&fil_system.mutex);
+ space.size= file->size= size;
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ for (auto &rseg : trx_sys.rseg_array)
+ {
+ if (rseg.space != &space)
+ continue;
+
+ dberr_t err;
+ buf_block_t *rblock= trx_rseg_header_create(&space,
+ &rseg - trx_sys.rseg_array,
+ trx_sys.get_max_trx_id(),
+ &mtr, &err);
+ ut_a(rblock);
+ /* These were written by trx_rseg_header_create(). */
+ ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT +
+ rblock->page.frame));
+ ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE +
+ rblock->page.frame));
+ rseg.reinit(rblock->page.id().page_no());
+ }
+
+ mtr.commit_shrink(space);
+
+ /* No mutex; this is only updated by the purge coordinator. */
+ export_vars.innodb_undo_truncations++;
+
+ if (purge_sys.rseg && purge_sys.rseg->last_page_no == FIL_NULL)
+ {
+ /* If purge_sys.rseg is pointing to rseg that was recently
+ truncated then move to next rseg element.
+
+ Note: Ideally purge_sys.rseg should be NULL because purge should
+ complete processing of all the records but srv_purge_batch_size
+ can force the purge loop to exit before all the records are purged. */
+ purge_sys.rseg= nullptr;
+ purge_sys.next_stored= false;
+ }
+
+ DBUG_EXECUTE_IF("ib_undo_trunc", ib::info() << "ib_undo_trunc";
+ log_buffer_flush_to_disk();
+ DBUG_SUICIDE(););
+
+ for (auto &rseg : trx_sys.rseg_array)
+ if (rseg.space == &space)
+ rseg.clear_skip_allocation();
+
+ ib::info() << "Truncated " << file->name;
+ purge_sys.truncate.last= purge_sys.truncate.current;
+ ut_ad(&space == purge_sys.truncate.current);
+ purge_sys.truncate.current= nullptr;
+ }
+}
+
+/***********************************************************************//**
+Updates in the rseg the information about the last not yet purged history
+log when we have purged a whole undo log. Also advances purge_sys.tail
+past the purged log. */
+static void trx_purge_rseg_get_next_history_log(
+ ulint* n_pages_handled)/*!< in/out: number of UNDO pages
+ handled */
+{
+ fil_addr_t prev_log_addr;
+ mtr_t mtr;
+
+ mtr.start();
+
+ purge_sys.rseg->latch.wr_lock(SRW_LOCK_CALL);
+
+ ut_a(purge_sys.rseg->last_page_no != FIL_NULL);
+
+ purge_sys.tail.trx_no= purge_sys.rseg->last_trx_no() + 1;
+ purge_sys.tail.undo_no= 0;
+ purge_sys.next_stored= false;
+
+ if (const buf_block_t* undo_page=
+ buf_page_get_gen(page_id_t(purge_sys.rseg->space->id,
+ purge_sys.rseg->last_page_no),
+ 0, RW_S_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED, &mtr))
+ {
+ const trx_ulogf_t *log_hdr=
+ undo_page->page.frame + purge_sys.rseg->last_offset();
+ /* Increase the purge page count by one for every handled log */
+ ++*n_pages_handled;
+ prev_log_addr= flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE);
+ prev_log_addr.boffset = static_cast<uint16_t>(prev_log_addr.boffset -
+ TRX_UNDO_HISTORY_NODE);
+ }
+ else
+ prev_log_addr.page= FIL_NULL;
+
+ const bool empty= prev_log_addr.page == FIL_NULL;
+
+ if (empty)
+ /* No logs left in the history list */
+ purge_sys.rseg->last_page_no= FIL_NULL;
+
+ purge_sys.rseg->latch.wr_unlock();
+ mtr.commit();
+
+ if (empty)
+ return;
+
+ /* Read the previous log header. */
+ mtr.start();
+
+ byte needs_purge= 0;
+ trx_id_t trx_no= 0;
+
+ if (const buf_block_t* undo_page=
+ buf_page_get_gen(page_id_t(purge_sys.rseg->space->id, prev_log_addr.page),
+ 0, RW_S_LATCH, nullptr, BUF_GET_POSSIBLY_FREED, &mtr))
+ {
+ const byte *log_hdr= undo_page->page.frame + prev_log_addr.boffset;
+
+ trx_no= mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
+ ut_ad(mach_read_from_2(log_hdr + TRX_UNDO_NEEDS_PURGE) <= 1);
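+    /* TRX_UNDO_NEEDS_PURGE is a 16-bit big-endian field whose value is
+    0 or 1 (asserted above), so reading its low-order byte suffices. */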
+ needs_purge= log_hdr[TRX_UNDO_NEEDS_PURGE + 1];
+ }
+
+ mtr.commit();
+
+ if (UNIV_UNLIKELY(!trx_no))
+ return;
+
+ purge_sys.rseg->latch.wr_lock(SRW_LOCK_CALL);
+ purge_sys.rseg->last_page_no= prev_log_addr.page;
+ purge_sys.rseg->set_last_commit(prev_log_addr.boffset, trx_no);
+
+ if (needs_purge)
+ purge_sys.rseg->set_needs_purge();
+ else
+ purge_sys.rseg->clear_needs_purge();
+
+  /* Purge can also produce events; however, these are already ordered
+  in the rollback segment and any user-generated event will be greater
+  than the events that Purge produces, i.e. Purge can never produce
+  events from an empty rollback segment. */
+
+ mysql_mutex_lock(&purge_sys.pq_mutex);
+ purge_sys.purge_queue.push(*purge_sys.rseg);
+ mysql_mutex_unlock(&purge_sys.pq_mutex);
+ purge_sys.rseg->latch.wr_unlock();
+}
+
+/** Position the purge sys "iterator" on the undo record to use for purging. */
+static void trx_purge_read_undo_rec()
+{
+ uint16_t offset;
+ uint32_t page_no;
+ ib_uint64_t undo_no;
+
+ purge_sys.hdr_offset = purge_sys.rseg->last_offset();
+ page_no = purge_sys.hdr_page_no = purge_sys.rseg->last_page_no;
+
+ if (purge_sys.rseg->needs_purge()) {
+ mtr_t mtr;
+ mtr.start();
+ const buf_block_t* undo_page;
+ if (trx_undo_rec_t* undo_rec = trx_undo_get_first_rec(
+ *purge_sys.rseg->space, purge_sys.hdr_page_no,
+ purge_sys.hdr_offset, RW_S_LATCH,
+ undo_page, &mtr, nullptr)) {
+
+ offset = page_offset(undo_rec);
+ undo_no = trx_undo_rec_get_undo_no(undo_rec);
+ page_no = undo_page->page.id().page_no();
+ } else {
+ offset = 0;
+ undo_no = 0;
+ }
+
+ mtr.commit();
+ } else {
+ offset = 0;
+ undo_no = 0;
+ }
+
+ purge_sys.offset = offset;
+ purge_sys.page_no = page_no;
+ purge_sys.tail.undo_no = undo_no;
+
+ purge_sys.next_stored = true;
+}
+
+/***********************************************************************//**
+Chooses the next undo log to purge and updates the info in purge_sys. This
+function is used to initialize purge_sys when the next record to purge is
+not known, and also to update the purge system info on the next record when
+purge has handled the whole undo log for a transaction. */
+TRANSACTIONAL_TARGET static void trx_purge_choose_next_log()
+{
+ ut_ad(!purge_sys.next_stored);
+
+ if (purge_sys.rseg_iter.set_next()) {
+ trx_purge_read_undo_rec();
+ } else {
+ /* There is nothing to do yet. */
+ std::this_thread::yield();
+ }
+}
+
+/***********************************************************************//**
+Gets the next record to purge and updates the info in the purge system.
+@return copy of an undo log record
+@retval -1 if there is nothing to purge
+@retval nullptr on corruption */
+static
+trx_undo_rec_t*
+trx_purge_get_next_rec(
+/*===================*/
+ ulint* n_pages_handled,/*!< in/out: number of UNDO pages
+ handled */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ mtr_t mtr;
+
+ ut_ad(purge_sys.next_stored);
+ ut_ad(purge_sys.tail.trx_no < purge_sys.low_limit_no());
+
+ const page_id_t page_id{purge_sys.rseg->space->id, purge_sys.page_no};
+ const uint16_t offset = purge_sys.offset;
+
+ if (offset == 0) {
+ /* It is the dummy undo log record, which means that there is
+ no need to purge this undo log */
+
+ trx_purge_rseg_get_next_history_log(n_pages_handled);
+
+ /* Look for the next undo log and record to purge */
+
+ trx_purge_choose_next_log();
+ return reinterpret_cast<trx_undo_rec_t*>(-1);
+ }
+
+ mtr.start();
+
+ const buf_block_t* undo_page
+ = buf_page_get_gen(page_id, 0, RW_S_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED, &mtr);
+ if (UNIV_UNLIKELY(!undo_page)) {
+corrupted:
+ mtr.commit();
+ return nullptr;
+ }
+
+ const buf_block_t* rec2_page = undo_page;
+
+ const trx_undo_rec_t* rec2 = trx_undo_page_get_next_rec(
+ undo_page, offset, purge_sys.hdr_page_no, purge_sys.hdr_offset);
+
+ if (rec2 == NULL) {
+ rec2 = trx_undo_get_next_rec(rec2_page, offset,
+ purge_sys.hdr_page_no,
+ purge_sys.hdr_offset, &mtr);
+ }
+
+ if (rec2 == NULL) {
+ mtr_commit(&mtr);
+
+ trx_purge_rseg_get_next_history_log(n_pages_handled);
+
+ /* Look for the next undo log and record to purge */
+
+ trx_purge_choose_next_log();
+
+ mtr_start(&mtr);
+
+ undo_page = buf_page_get_gen(page_id, 0, RW_S_LATCH,
+ nullptr, BUF_GET_POSSIBLY_FREED,
+ &mtr);
+ if (UNIV_UNLIKELY(!undo_page)) {
+ goto corrupted;
+ }
+ } else {
+ purge_sys.offset = page_offset(rec2);
+ purge_sys.page_no = rec2_page->page.id().page_no();
+ purge_sys.tail.undo_no = trx_undo_rec_get_undo_no(rec2);
+
+ if (undo_page != rec2_page) {
+ /* We advance to a new page of the undo log: */
+ (*n_pages_handled)++;
+ }
+ }
+
+ trx_undo_rec_t* rec_copy = trx_undo_rec_copy(undo_page->page.frame
+ + offset, heap);
+
+ mtr.commit();
+ return rec_copy;
+}
+
+/********************************************************************//**
+Fetches the next undo log record from the history list to purge and copies
+it to the memory heap provided by the caller.
+@return copy of an undo log record
+@retval -1 if the whole undo log can be skipped in purge
+@retval nullptr if nothing is left, or on corruption */
+static MY_ATTRIBUTE((warn_unused_result))
+trx_undo_rec_t*
+trx_purge_fetch_next_rec(
+/*=====================*/
+ roll_ptr_t* roll_ptr, /*!< out: roll pointer to undo record */
+ ulint* n_pages_handled,/*!< in/out: number of UNDO log pages
+ handled */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ if (!purge_sys.next_stored) {
+ trx_purge_choose_next_log();
+
+ if (!purge_sys.next_stored) {
+ DBUG_PRINT("ib_purge",
+ ("no logs left in the history list"));
+ return nullptr;
+ }
+ }
+
+ if (purge_sys.tail.trx_no >= purge_sys.low_limit_no()) {
+ return nullptr;
+ }
+
+ /* fprintf(stderr, "Thread %lu purging trx %llu undo record %llu\n",
+ pthread_self(), iter->trx_no, iter->undo_no); */
+
+ *roll_ptr = trx_undo_build_roll_ptr(
+ /* row_purge_record_func() will later set
+ ROLL_PTR_INSERT_FLAG for TRX_UNDO_INSERT_REC */
+ false,
+ trx_sys.rseg_id(purge_sys.rseg, true),
+ purge_sys.page_no, purge_sys.offset);
+
+ /* The following call will advance the stored values of the
+ purge iterator. */
+
+ return trx_purge_get_next_rec(n_pages_handled, heap);
+}
+
+/** Run a purge batch.
+@param n_purge_threads number of purge threads
+@return number of undo log pages handled in the batch */
+static
+ulint
+trx_purge_attach_undo_recs(ulint n_purge_threads)
+{
+ que_thr_t* thr;
+ ulint i;
+ ulint n_pages_handled = 0;
+ ulint n_thrs = UT_LIST_GET_LEN(purge_sys.query->thrs);
+
+ ut_a(n_purge_threads > 0);
+
+ purge_sys.head = purge_sys.tail;
+
+#ifdef UNIV_DEBUG
+ i = 0;
+ /* Debug code to validate some pre-requisites and reset done flag. */
+ for (thr = UT_LIST_GET_FIRST(purge_sys.query->thrs);
+ thr != NULL && i < n_purge_threads;
+ thr = UT_LIST_GET_NEXT(thrs, thr), ++i) {
+
+ purge_node_t* node;
+
+ /* Get the purge node. */
+ node = (purge_node_t*) thr->child;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
+ ut_ad(node->undo_recs.empty());
+ ut_ad(!node->in_progress);
+ ut_d(node->in_progress = true);
+ }
+
+	/* There should never be fewer nodes than threads; the inverse,
+	however, is allowed because we only use purge threads as needed. */
+ ut_ad(i == n_purge_threads);
+#endif
+
+ /* Fetch and parse the UNDO records. The UNDO records are added
+ to a per purge node vector. */
+ thr = UT_LIST_GET_FIRST(purge_sys.query->thrs);
+ ut_a(n_thrs > 0 && thr != NULL);
+
+ ut_ad(purge_sys.head <= purge_sys.tail);
+
+ i = 0;
+
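+	/* Records are assigned to purge nodes by table, so that all undo
+	records that belong to the same table are processed by the same
+	purge thread. */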
+ std::unordered_map<table_id_t, purge_node_t*> table_id_map;
+ mem_heap_empty(purge_sys.heap);
+
+ while (UNIV_LIKELY(srv_undo_sources) || !srv_fast_shutdown) {
+ purge_node_t* node;
+ trx_purge_rec_t purge_rec;
+
+ /* Get the purge node. */
+ node = (purge_node_t*) thr->child;
+ ut_a(que_node_get_type(node) == QUE_NODE_PURGE);
+
+ /* Track the max {trx_id, undo_no} for truncating the
+ UNDO logs once we have purged the records. */
+
+ if (purge_sys.head <= purge_sys.tail) {
+ purge_sys.head = purge_sys.tail;
+ }
+
+ /* Fetch the next record, and advance the purge_sys.tail. */
+ purge_rec.undo_rec = trx_purge_fetch_next_rec(
+ &purge_rec.roll_ptr, &n_pages_handled,
+ purge_sys.heap);
+
+ if (purge_rec.undo_rec == NULL) {
+ break;
+ } else if (purge_rec.undo_rec
+ == reinterpret_cast<trx_undo_rec_t*>(-1)) {
+ continue;
+ }
+
+ table_id_t table_id = trx_undo_rec_get_table_id(
+ purge_rec.undo_rec);
+
+ purge_node_t *& table_node = table_id_map[table_id];
+
+ if (table_node) {
+ node = table_node;
+ } else {
+ thr = UT_LIST_GET_NEXT(thrs, thr);
+
+ if (!(++i % n_purge_threads)) {
+ thr = UT_LIST_GET_FIRST(
+ purge_sys.query->thrs);
+ }
+
+ ut_a(thr != NULL);
+ table_node = node;
+ }
+
+ node->undo_recs.push(purge_rec);
+
+ if (n_pages_handled >= srv_purge_batch_size) {
+ break;
+ }
+ }
+
+ ut_ad(purge_sys.head <= purge_sys.tail);
+
+ return(n_pages_handled);
+}
+
+/*******************************************************************//**
+Calculate the DML delay required.
+@return delay in microseconds or ULINT_MAX */
+static
+ulint
+trx_purge_dml_delay(void)
+/*=====================*/
+{
+	/* Determine how long data manipulation language (DML) statements
+ need to be delayed in order to reduce the lagging of the purge
+ thread. */
+ ulint delay = 0; /* in microseconds; default: no delay */
+
+ /* If purge lag is set then calculate the new DML delay. */
+
+ if (srv_max_purge_lag > 0) {
+ double ratio = static_cast<double>(trx_sys.history_size()) /
+ static_cast<double>(srv_max_purge_lag);
+
+ if (ratio > 1.0) {
+ /* If the history list length exceeds the
+ srv_max_purge_lag, the data manipulation
+ statements are delayed by at least 5000
+ microseconds. */
+ delay = (ulint) ((ratio - .5) * 10000);
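+			/* For example, a history list twice as long as
+			srv_max_purge_lag (ratio == 2.0) yields a delay of
+			15000 microseconds, subject to the
+			srv_max_purge_lag_delay cap below. */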
+ }
+
+ if (delay > srv_max_purge_lag_delay) {
+ delay = srv_max_purge_lag_delay;
+ }
+
+ MONITOR_SET(MONITOR_DML_PURGE_DELAY, delay);
+ }
+
+ return(delay);
+}
+
+extern tpool::waitable_task purge_worker_task;
+
+/** Wait for pending purge jobs to complete. */
+static void trx_purge_wait_for_workers_to_complete()
+{
+ const bool notify_wait{purge_worker_task.is_running()};
+
+ if (notify_wait)
+ tpool::tpool_wait_begin();
+
+ purge_worker_task.wait();
+
+ if (notify_wait)
+ tpool::tpool_wait_end();
+
+ /* There should be no outstanding tasks as long
+ as the worker threads are active. */
+ ut_ad(srv_get_task_queue_length() == 0);
+}
+
+/** Update end_view at the end of a purge batch. */
+TRANSACTIONAL_INLINE void purge_sys_t::clone_end_view()
+{
+  /* This is invoked only by the purge coordinator,
+ which is the only thread that can modify our inputs head, tail, view.
+ Therefore, we only need to protect end_view from concurrent reads. */
+
+ /* Limit the end_view similar to what trx_purge_truncate_history() does. */
+ const trx_id_t trx_no= head.trx_no ? head.trx_no : tail.trx_no;
+#ifdef SUX_LOCK_GENERIC
+ end_latch.wr_lock();
+#else
+ transactional_lock_guard<srw_spin_lock_low> g(end_latch);
+#endif
+ end_view= view;
+ end_view.clamp_low_limit_id(trx_no);
+#ifdef SUX_LOCK_GENERIC
+ end_latch.wr_unlock();
+#endif
+}
+
+/**
+Run a purge batch.
+@param n_tasks number of purge tasks to submit to the queue
+@param truncate whether to truncate the history at the end of the batch
+@return number of undo log pages handled in the batch */
+TRANSACTIONAL_TARGET ulint trx_purge(ulint n_tasks, bool truncate)
+{
+ que_thr_t* thr = NULL;
+ ulint n_pages_handled;
+
+ ut_ad(n_tasks > 0);
+
+ srv_dml_needed_delay = trx_purge_dml_delay();
+
+ purge_sys.clone_oldest_view();
+
+#ifdef UNIV_DEBUG
+ if (srv_purge_view_update_only_debug) {
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Fetch the UNDO recs that need to be purged. */
+ n_pages_handled = trx_purge_attach_undo_recs(n_tasks);
+
+	/* Submit n_tasks - 1 tasks to the work queue; the remaining task
+	is executed by this (coordinator) thread below. */
+ for (ulint i = n_tasks; --i; ) {
+ thr = que_fork_scheduler_round_robin(purge_sys.query, thr);
+ ut_a(thr);
+ srv_que_task_enqueue_low(thr);
+ srv_thread_pool->submit_task(&purge_worker_task);
+ }
+
+ thr = que_fork_scheduler_round_robin(purge_sys.query, thr);
+
+ que_run_threads(thr);
+
+ trx_purge_wait_for_workers_to_complete();
+
+ purge_sys.clone_end_view();
+
+ if (truncate) {
+ trx_purge_truncate_history();
+ }
+
+ MONITOR_INC_VALUE(MONITOR_PURGE_INVOKED, 1);
+ MONITOR_INC_VALUE(MONITOR_PURGE_N_PAGE_HANDLED, n_pages_handled);
+
+ return(n_pages_handled);
+}
diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc
new file mode 100644
index 00000000..dc24f083
--- /dev/null
+++ b/storage/innobase/trx/trx0rec.cc
@@ -0,0 +1,2426 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rec.cc
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rec.h"
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0undo.h"
+#include "mtr0log.h"
+#include "dict0dict.h"
+#include "ut0mem.h"
+#include "row0ext.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "trx0purge.h"
+#include "trx0rseg.h"
+#include "row0row.h"
+#include "row0mysql.h"
+#include "row0ins.h"
+
+/** The search tuple corresponding to TRX_UNDO_INSERT_METADATA. */
+const dtuple_t trx_undo_metadata = {
+ /* This also works for REC_INFO_METADATA_ALTER, because the
+ delete-mark (REC_INFO_DELETED_FLAG) is ignored when searching. */
+ REC_INFO_METADATA_ADD, 0, 0,
+ NULL, 0, NULL
+#ifdef UNIV_DEBUG
+ , DATA_TUPLE_MAGIC_N
+#endif /* UNIV_DEBUG */
+};
+
+/*=========== UNDO LOG RECORD CREATION AND DECODING ====================*/
+
+/** Calculate the free space left for extending an undo log record.
+@param undo_block undo log page
+@param ptr current end of the undo page
+@return bytes left */
+static ulint trx_undo_left(const buf_block_t *undo_block, const byte *ptr)
+{
+ ut_ad(ptr >=
+ &undo_block->page.frame[TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE]);
+ /* The 10 is supposed to be an extra safety margin (and needed for
+ compatibility with older versions) */
+ lint left= srv_page_size - (ptr - undo_block->page.frame) -
+ (10 + FIL_PAGE_DATA_END);
+ ut_ad(left >= 0);
+ return left < 0 ? 0 : static_cast<ulint>(left);
+}
+
+/**********************************************************************//**
+Set the next and previous pointers in the undo page for the undo record
+that was written to ptr. Update the first free value by the number of bytes
+written for this undo record.
+@return offset of the inserted entry on the page if it succeeded, 0 on failure */
+static
+uint16_t
+trx_undo_page_set_next_prev_and_add(
+/*================================*/
+ buf_block_t* undo_block, /*!< in/out: undo log page */
+ byte* ptr, /*!< in: ptr up to where data has been
+ written on this undo page. */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(page_align(ptr) == undo_block->page.frame);
+
+ if (UNIV_UNLIKELY(trx_undo_left(undo_block, ptr) < 2))
+ return 0;
+
+ byte *ptr_to_first_free= my_assume_aligned<2>(TRX_UNDO_PAGE_HDR +
+ TRX_UNDO_PAGE_FREE +
+ undo_block->page.frame);
+
+ const uint16_t first_free= mach_read_from_2(ptr_to_first_free);
+
+ /* Write offset of the previous undo log record */
+ memcpy(ptr, ptr_to_first_free, 2);
+ ptr += 2;
+
+ const uint16_t end_of_rec= static_cast<uint16_t>
+ (ptr - undo_block->page.frame);
+
+ /* Update the offset to first free undo record */
+ mach_write_to_2(ptr_to_first_free, end_of_rec);
+ /* Write offset of the next undo log record */
+ memcpy(undo_block->page.frame + first_free, ptr_to_first_free, 2);
+ const byte *start= undo_block->page.frame + first_free + 2;
+
+ mtr->undo_append(*undo_block, start, ptr - start - 2);
+ return first_free;
+}
+
+/** Virtual column undo log version. To distinguish it from a length value
+in 5.7.8 undo log, it starts with 0xF1 */
+static const ulint VIRTUAL_COL_UNDO_FORMAT_1 = 0xF1;
+
+/** Write virtual column index info (index id and column position in index)
+to the undo log
+@param[in,out] undo_block undo log page
+@param[in] table the table
+@param[in] pos the virtual column position
+@param[in] ptr undo log record being written
+@param[in] first_v_col whether this is the first virtual column
+ which could start with a version marker
+@return new undo log pointer */
+static
+byte*
+trx_undo_log_v_idx(
+ buf_block_t* undo_block,
+ const dict_table_t* table,
+ ulint pos,
+ byte* ptr,
+ bool first_v_col)
+{
+ ut_ad(pos < table->n_v_def);
+ dict_v_col_t* vcol = dict_table_get_nth_v_col(table, pos);
+ byte* old_ptr;
+
+ ut_ad(!vcol->v_indexes.empty());
+
+ ulint size = first_v_col ? 1 + 2 : 2;
+ const ulint avail = trx_undo_left(undo_block, ptr);
+
+ /* The mach_write_compressed(ptr, flen) in
+	trx_undo_page_report_modify() will consume an additional 1 to 5 bytes. */
+ if (avail < size + 5) {
+ return(NULL);
+ }
+
+ ulint n_idx = 0;
+ for (const auto& v_index : vcol->v_indexes) {
+ n_idx++;
+ /* FIXME: index->id is 64 bits! */
+ size += mach_get_compressed_size(uint32_t(v_index.index->id));
+ size += mach_get_compressed_size(v_index.nth_field);
+ }
+
+ size += mach_get_compressed_size(n_idx);
+
+ if (avail < size + 5) {
+ return(NULL);
+ }
+
+ ut_d(const byte* orig_ptr = ptr);
+
+ if (first_v_col) {
+ /* write the version marker */
+ mach_write_to_1(ptr, VIRTUAL_COL_UNDO_FORMAT_1);
+
+ ptr += 1;
+ }
+
+ old_ptr = ptr;
+
+ ptr += 2;
+
+ ptr += mach_write_compressed(ptr, n_idx);
+
+ for (const auto& v_index : vcol->v_indexes) {
+ ptr += mach_write_compressed(
+ /* FIXME: index->id is 64 bits! */
+ ptr, uint32_t(v_index.index->id));
+
+ ptr += mach_write_compressed(ptr, v_index.nth_field);
+ }
+
+ ut_ad(orig_ptr + size == ptr);
+
+ mach_write_to_2(old_ptr, ulint(ptr - old_ptr));
+
+ return(ptr);
+}
+
+/** Read the virtual column index from the undo log, verify that the column
+is still indexed, and return its position
+@param[in] table the table
+@param[in] ptr undo log pointer
+@param[out] col_pos the column number or FIL_NULL
+ if the column is not indexed any more
+@return remaining part of undo log record after reading these values */
+static
+const byte*
+trx_undo_read_v_idx_low(
+ const dict_table_t* table,
+ const byte* ptr,
+ uint32_t* col_pos)
+{
+ ulint len = mach_read_from_2(ptr);
+ const byte* old_ptr = ptr;
+
+ *col_pos = FIL_NULL;
+
+ ptr += 2;
+
+ ulint num_idx = mach_read_next_compressed(&ptr);
+
+ ut_ad(num_idx > 0);
+
+ dict_index_t* clust_index = dict_table_get_first_index(table);
+
+ for (ulint i = 0; i < num_idx; i++) {
+ index_id_t id = mach_read_next_compressed(&ptr);
+ ulint pos = mach_read_next_compressed(&ptr);
+ dict_index_t* index = dict_table_get_next_index(clust_index);
+
+ while (index != NULL) {
+ /* Return if we find a matching index.
+			TODO: in the future, it might be worthwhile to add
+			checks on other indexes */
+ if (index->id == id) {
+ const dict_col_t* col = dict_index_get_nth_col(
+ index, pos);
+ ut_ad(col->is_virtual());
+ const dict_v_col_t* vcol = reinterpret_cast<
+ const dict_v_col_t*>(col);
+ *col_pos = vcol->v_pos;
+ return(old_ptr + len);
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+ }
+
+ return(old_ptr + len);
+}
+
+/** Read virtual column index from undo log or online log if the log
+contains such info, and in the undo log case, verify the column is
+still indexed, and output its position
+@param[in] table the table
+@param[in] ptr undo log pointer
+@param[in] first_v_col if this is the first virtual column, which
+ has the version marker
+@param[in,out]	is_undo_log	this function is used to parse both the
+				undo log and the online log for virtual
+				columns, so we need to know which one is
+				being read; when first_v_col is true,
+				is_undo_log is an output, otherwise it is
+				an input
+@param[out] field_no the column number, or FIL_NULL if not indexed
+@return remaining part of undo log record after reading these values */
+const byte*
+trx_undo_read_v_idx(
+ const dict_table_t* table,
+ const byte* ptr,
+ bool first_v_col,
+ bool* is_undo_log,
+ uint32_t* field_no)
+{
+ /* Version marker only put on the first virtual column */
+ if (first_v_col) {
+ /* Undo log has the virtual undo log marker */
+ *is_undo_log = (mach_read_from_1(ptr)
+ == VIRTUAL_COL_UNDO_FORMAT_1);
+
+ if (*is_undo_log) {
+ ptr += 1;
+ }
+ }
+
+ if (*is_undo_log) {
+ ptr = trx_undo_read_v_idx_low(table, ptr, field_no);
+ } else {
+ *field_no -= REC_MAX_N_FIELDS;
+ }
+
+ return(ptr);
+}
+
+/** Write to the undo log the virtual column values of an inserted row.
+@param[in] undo_block undo log page
+@param[in] table the table
+@param[in] row dtuple contains the virtual columns
+@param[in,out] ptr log ptr
+@return true if the write succeeds, false if out of space */
+static
+bool
+trx_undo_report_insert_virtual(
+ buf_block_t* undo_block,
+ dict_table_t* table,
+ const dtuple_t* row,
+ byte** ptr)
+{
+ byte* start = *ptr;
+ bool first_v_col = true;
+
+ if (trx_undo_left(undo_block, *ptr) < 2) {
+ return(false);
+ }
+
+ /* Reserve 2 bytes to write the number
+ of bytes the stored fields take in this
+ undo record */
+ *ptr += 2;
+
+ for (ulint col_no = 0; col_no < dict_table_get_n_v_cols(table);
+ col_no++) {
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(table, col_no);
+
+ if (col->m_col.ord_part) {
+
+			/* make sure there is enough space to write the length */
+ if (trx_undo_left(undo_block, *ptr) < 5) {
+ return(false);
+ }
+
+ ulint pos = col_no;
+ pos += REC_MAX_N_FIELDS;
+ *ptr += mach_write_compressed(*ptr, pos);
+
+ *ptr = trx_undo_log_v_idx(undo_block, table,
+ col_no, *ptr, first_v_col);
+ first_v_col = false;
+
+ if (*ptr == NULL) {
+ return(false);
+ }
+
+ const dfield_t* vfield = dtuple_get_nth_v_field(
+ row, col->v_pos);
+ switch (ulint flen = vfield->len) {
+ case 0: case UNIV_SQL_NULL:
+ if (trx_undo_left(undo_block, *ptr) < 5) {
+ return(false);
+ }
+
+ *ptr += mach_write_compressed(*ptr, flen);
+ break;
+ default:
+ ulint max_len
+ = dict_max_v_field_len_store_undo(
+ table, col_no);
+
+ if (flen > max_len) {
+ flen = max_len;
+ }
+
+ if (trx_undo_left(undo_block, *ptr)
+ < flen + 5) {
+ return(false);
+ }
+ *ptr += mach_write_compressed(*ptr, flen);
+
+ memcpy(*ptr, vfield->data, flen);
+ *ptr += flen;
+ }
+ }
+ }
+
+ /* Always mark the end of the log with 2 bytes length field */
+ mach_write_to_2(start, ulint(*ptr - start));
+
+ return(true);
+}
+
+/** Report in the undo log an insert of a clustered index record.
+@param undo_block undo log page
+@param trx transaction
+@param index clustered index
+@param clust_entry index entry which will be inserted to the
+ clustered index
+@param mtr mini-transaction
+@param write_empty write empty table undo log record
+@return offset of the inserted entry on the page if it succeeds, 0 on failure */
+static
+uint16_t
+trx_undo_page_report_insert(
+ buf_block_t* undo_block,
+ trx_t* trx,
+ dict_index_t* index,
+ const dtuple_t* clust_entry,
+ mtr_t* mtr,
+ bool write_empty)
+{
+ ut_ad(index->is_primary());
+ /* MariaDB 10.3.1+ in trx_undo_page_init() always initializes
+ TRX_UNDO_PAGE_TYPE as 0, but previous versions wrote
+ TRX_UNDO_INSERT == 1 into insert_undo pages,
+ or TRX_UNDO_UPDATE == 2 into update_undo pages. */
+ ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
+ + undo_block->page.frame) <= 2);
+
+ uint16_t first_free = mach_read_from_2(my_assume_aligned<2>
+ (TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + undo_block->page.frame));
+ byte* ptr = undo_block->page.frame + first_free;
+
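+	/* 2 bytes for the next-record pointer, 1 byte for the type, and at
+	most 11 bytes each for the much-compressed undo_no and table id. */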
+ if (trx_undo_left(undo_block, ptr) < 2 + 1 + 11 + 11) {
+ /* Not enough space for writing the general parameters */
+ return(0);
+ }
+
+ /* Reserve 2 bytes for the pointer to the next undo log record */
+ ptr += 2;
+
+ /* Store first some general parameters to the undo log */
+ *ptr++ = TRX_UNDO_INSERT_REC;
+ ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
+ ptr += mach_u64_write_much_compressed(ptr, index->table->id);
+
+ if (write_empty) {
+ /* Table is in bulk operation */
+ undo_block->page.frame[first_free + 2] = TRX_UNDO_EMPTY;
+ goto done;
+ }
+
+ /*----------------------------------------*/
+ /* Store then the fields required to uniquely determine the record
+ to be inserted in the clustered index */
+ if (UNIV_UNLIKELY(clust_entry->info_bits != 0)) {
+ ut_ad(clust_entry->is_metadata());
+ ut_ad(index->is_instant());
+ ut_ad(undo_block->page.frame[first_free + 2]
+ == TRX_UNDO_INSERT_REC);
+ undo_block->page.frame[first_free + 2]
+ = TRX_UNDO_INSERT_METADATA;
+ goto done;
+ }
+
+ for (unsigned i = 0; i < dict_index_get_n_unique(index); i++) {
+
+ const dfield_t* field = dtuple_get_nth_field(clust_entry, i);
+ ulint flen = dfield_get_len(field);
+
+ if (trx_undo_left(undo_block, ptr) < 5) {
+
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ switch (flen) {
+ case 0: case UNIV_SQL_NULL:
+ break;
+ default:
+ if (trx_undo_left(undo_block, ptr) < flen) {
+
+ return(0);
+ }
+
+ memcpy(ptr, dfield_get_data(field), flen);
+ ptr += flen;
+ }
+ }
+
+ if (index->table->n_v_cols) {
+ if (!trx_undo_report_insert_virtual(
+ undo_block, index->table, clust_entry, &ptr)) {
+ return(0);
+ }
+ }
+
+done:
+ return(trx_undo_page_set_next_prev_and_add(undo_block, ptr, mtr));
+}
+
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
+@return remaining part of undo log record after reading these values */
+const byte*
+trx_undo_rec_get_pars(
+/*==================*/
+ const trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ ulint* type, /*!< out: undo record type:
+ TRX_UNDO_INSERT_REC, ... */
+ ulint* cmpl_info, /*!< out: compiler info, relevant only
+ for update type records */
+ bool* updated_extern, /*!< out: true if we updated an
+					externally stored field */
+ undo_no_t* undo_no, /*!< out: undo log record number */
+ table_id_t* table_id) /*!< out: table id */
+{
+ ulint type_cmpl;
+
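+	/* The first two bytes of an undo log record hold the page offset of
+	the next undo log record; the type and flags byte follows at offset 2. */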
+ type_cmpl = undo_rec[2];
+ const byte *ptr = undo_rec + 3;
+
+ *updated_extern = !!(type_cmpl & TRX_UNDO_UPD_EXTERN);
+ type_cmpl &= ~TRX_UNDO_UPD_EXTERN;
+ *type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1);
+ ut_ad(*type >= TRX_UNDO_RENAME_TABLE);
+ ut_ad(*type <= TRX_UNDO_EMPTY);
+ *cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT;
+
+ *undo_no = mach_read_next_much_compressed(&ptr);
+ *table_id = mach_read_next_much_compressed(&ptr);
+ ut_ad(*table_id);
+
+ return ptr;
+}
+
+/** Read from an undo log record a non-virtual column value.
+@param ptr pointer to remaining part of the undo record
+@param field stored field
+@param len length of the field, or UNIV_SQL_NULL
+@param orig_len original length of the locally stored part
+of an externally stored column, or 0
+@return remaining part of undo log record after reading these values */
+const byte *trx_undo_rec_get_col_val(const byte *ptr, const byte **field,
+ uint32_t *len, uint32_t *orig_len)
+{
+ *len = mach_read_next_compressed(&ptr);
+ *orig_len = 0;
+
+ switch (*len) {
+ case UNIV_SQL_NULL:
+ *field = NULL;
+ break;
+ case UNIV_EXTERN_STORAGE_FIELD:
+ *orig_len = mach_read_next_compressed(&ptr);
+ *len = mach_read_next_compressed(&ptr);
+ *field = ptr;
+ ptr += *len & ~SPATIAL_STATUS_MASK;
+
+ ut_ad(*orig_len >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_ad(*len > *orig_len);
+ /* @see dtuple_convert_big_rec() */
+ ut_ad(*len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ /* we do not have access to index->table here
+ ut_ad(dict_table_has_atomic_blobs(index->table)
+ || *len >= col->max_prefix
+ + BTR_EXTERN_FIELD_REF_SIZE);
+ */
+
+ *len += UNIV_EXTERN_STORAGE_FIELD;
+ break;
+ default:
+ *field = ptr;
+ if (*len >= UNIV_EXTERN_STORAGE_FIELD) {
+ ptr += (*len - UNIV_EXTERN_STORAGE_FIELD)
+ & ~SPATIAL_STATUS_MASK;
+ } else {
+ ptr += *len;
+ }
+ }
+
+ return ptr;
+}
+
+/*******************************************************************//**
+Builds a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+const byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+ const byte* ptr, /*!< in: remaining part of a copy of an undo log
+ record, at the start of the row reference;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the row reference is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t**ref, /*!< out, own: row reference */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ ut_ad(index->is_primary());
+
+ const ulint ref_len = dict_index_get_n_unique(index);
+
+ dtuple_t* tuple = dtuple_create(heap, ref_len);
+ *ref = tuple;
+
+ dict_index_copy_types(tuple, index, ref_len);
+
+ for (ulint i = 0; i < ref_len; i++) {
+ const byte* field;
+ uint32_t len, orig_len;
+
+ dfield_t* dfield = dtuple_get_nth_field(tuple, i);
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ dfield_set_data(dfield, field, len);
+ }
+
+ return ptr;
+}
+
+/** Skip a row reference from an undo log record.
+@param ptr part of an update undo log record
+@param index clustered index
+@return pointer to remaining part of undo record */
+static const byte *trx_undo_rec_skip_row_ref(const byte *ptr,
+ const dict_index_t *index)
+{
+ ut_ad(index->is_primary());
+
+ ulint ref_len = dict_index_get_n_unique(index);
+
+ for (ulint i = 0; i < ref_len; i++) {
+ const byte* field;
+ uint32_t len, orig_len;
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+ }
+
+ return(ptr);
+}
+
+/** Fetch a prefix of an externally stored column, for writing to the undo
+log of an update or delete marking of a clustered index record.
+@param[out] ext_buf buffer to hold the prefix data and BLOB pointer
+@param[in] prefix_len prefix size to store in the undo log
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] field an externally stored column
+@param[in,out] len input: length of field; output: used length of
+ext_buf
+@return ext_buf */
+static
+byte*
+trx_undo_page_fetch_ext(
+ byte* ext_buf,
+ ulint prefix_len,
+ ulint zip_size,
+ const byte* field,
+ ulint* len)
+{
+ /* Fetch the BLOB. */
+ ulint ext_len = btr_copy_externally_stored_field_prefix(
+ ext_buf, prefix_len, zip_size, field, *len);
+ /* BLOBs should always be nonempty. */
+ ut_a(ext_len);
+ /* Append the BLOB pointer to the prefix. */
+ memcpy(ext_buf + ext_len,
+ field + *len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ *len = ext_len + BTR_EXTERN_FIELD_REF_SIZE;
+ return(ext_buf);
+}
+
+/** Writes to the undo log a prefix of an externally stored column.
+@param[out] ptr undo log position, at least 15 bytes must be
+available
+@param[out] ext_buf a buffer of DICT_MAX_FIELD_LEN_BY_FORMAT()
+				size, or NULL when a longer prefix should
+				not be fetched
+@param[in] prefix_len prefix size to store in the undo log
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] field the locally stored part of the externally
+stored column
+@param[in,out] len length of field, in bytes
+@param[in] spatial_status whether the column is used by spatial index or
+ regular index
+@return undo log position */
+static
+byte*
+trx_undo_page_report_modify_ext(
+ byte* ptr,
+ byte* ext_buf,
+ ulint prefix_len,
+ ulint zip_size,
+ const byte** field,
+ ulint* len,
+ spatial_status_t spatial_status)
+{
+ ulint spatial_len= 0;
+
+ switch (spatial_status) {
+ case SPATIAL_UNKNOWN:
+ case SPATIAL_NONE:
+ break;
+
+ case SPATIAL_MIXED:
+ case SPATIAL_ONLY:
+ spatial_len = DATA_MBR_LEN;
+ break;
+ }
+
+ /* Encode spatial status into length. */
+ spatial_len |= ulint(spatial_status) << SPATIAL_STATUS_SHIFT;
+
+ if (spatial_status == SPATIAL_ONLY) {
+		/* If the column is only used by a spatial (GIS) index,
+		logging its MBR is enough. */
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD
+ + spatial_len);
+
+ return(ptr);
+ }
+
+ if (ext_buf) {
+ ut_a(prefix_len > 0);
+
+ /* If an ordering column is externally stored, we will
+ have to store a longer prefix of the field. In this
+ case, write to the log a marker followed by the
+ original length and the real length of the field. */
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD);
+
+ ptr += mach_write_compressed(ptr, *len);
+
+ *field = trx_undo_page_fetch_ext(ext_buf, prefix_len,
+ zip_size, *field, len);
+
+ ptr += mach_write_compressed(ptr, *len + spatial_len);
+ } else {
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD
+ + *len + spatial_len);
+ }
+
+ return(ptr);
+}
+
+/** Get MBR from a Geometry column stored externally
+@param[out] mbr MBR to fill
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] field field contain the geometry data
+@param[in,out] len length of field, in bytes
+*/
+static
+void
+trx_undo_get_mbr_from_ext(
+/*======================*/
+ double* mbr,
+ ulint zip_size,
+ const byte* field,
+ ulint* len)
+{
+ uchar* dptr = NULL;
+ ulint dlen;
+ mem_heap_t* heap = mem_heap_create(100);
+
+ dptr = btr_copy_externally_stored_field(
+ &dlen, field, zip_size, *len, heap);
+
+ if (dlen <= GEO_DATA_HEADER_SIZE) {
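+		/* The data is too short to contain any geometry;
+		store an inverted MBR to denote an empty bounding box. */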
+ for (uint i = 0; i < SPDIMS; ++i) {
+ mbr[i * 2] = DBL_MAX;
+ mbr[i * 2 + 1] = -DBL_MAX;
+ }
+ } else {
+ rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE,
+ static_cast<uint>(dlen
+ - GEO_DATA_HEADER_SIZE), SPDIMS, mbr);
+ }
+
+ mem_heap_free(heap);
+}
+
+/**********************************************************************//**
+Reports in the undo log an update or a delete marking of a clustered index
+record.
+@return byte offset of the inserted undo log entry on the page if it
+succeeds, 0 on failure */
+static
+uint16_t
+trx_undo_page_report_modify(
+/*========================*/
+ buf_block_t* undo_block, /*!< in: undo log page */
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: clustered index where update or
+ delete marking is done */
+ const rec_t* rec, /*!< in: clustered index record which
+ has NOT yet been modified */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update, /*!< in: update vector which tells the
+ columns to be updated; in the case of
+ a delete, this should be set to NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const dtuple_t* row, /*!< in: clustered index row contains
+ virtual column info */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(index->is_primary());
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ /* MariaDB 10.3.1+ in trx_undo_page_init() always initializes
+ TRX_UNDO_PAGE_TYPE as 0, but previous versions wrote
+ TRX_UNDO_INSERT == 1 into insert_undo pages,
+ or TRX_UNDO_UPDATE == 2 into update_undo pages. */
+ ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
+ + undo_block->page.frame) <= 2);
+
+ byte* ptr_to_first_free = my_assume_aligned<2>(
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + undo_block->page.frame);
+
+ const uint16_t first_free = mach_read_from_2(ptr_to_first_free);
+ byte *ptr = undo_block->page.frame + first_free;
+
+ if (trx_undo_left(undo_block, ptr) < 50) {
+ /* NOTE: the value 50 must be big enough so that the general
+ fields written below fit on the undo log page */
+ return 0;
+ }
+
+ /* Reserve 2 bytes for the pointer to the next undo log record */
+ ptr += 2;
+
+ dict_table_t* table = index->table;
+ const byte* field;
+ ulint flen;
+ ulint col_no;
+ ulint type_cmpl;
+ byte* type_cmpl_ptr;
+ ulint i;
+ trx_id_t trx_id;
+ ibool ignore_prefix = FALSE;
+ byte ext_buf[REC_VERSION_56_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE];
+ bool first_v_col = true;
+
+ /* Store first some general parameters to the undo log */
+
+ if (!update) {
+ ut_ad(!rec_is_delete_marked(rec, dict_table_is_comp(table)));
+ type_cmpl = TRX_UNDO_DEL_MARK_REC;
+ } else if (rec_is_delete_marked(rec, dict_table_is_comp(table))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing update_undo log record. */
+ ut_ad(row_get_rec_trx_id(rec, index, offsets));
+
+ type_cmpl = TRX_UNDO_UPD_DEL_REC;
+ /* We are about to update a delete marked record.
+ We don't typically need the prefix in this case unless
+ the delete marking is done by the same transaction
+ (which we check below). */
+ ignore_prefix = TRUE;
+ } else {
+ type_cmpl = TRX_UNDO_UPD_EXIST_REC;
+ }
+
+ type_cmpl |= cmpl_info * TRX_UNDO_CMPL_INFO_MULT;
+ type_cmpl_ptr = ptr;
+
+ *ptr++ = (byte) type_cmpl;
+ ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
+
+ ptr += mach_u64_write_much_compressed(ptr, table->id);
+
+ /*----------------------------------------*/
+ /* Store the state of the info bits */
+
+ *ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table));
+
+ /* Store the values of the system columns */
+ field = rec_get_nth_field(rec, offsets, index->db_trx_id(), &flen);
+ ut_ad(flen == DATA_TRX_ID_LEN);
+
+ trx_id = trx_read_trx_id(field);
+
+ /* If it is an update of a delete marked record, then we are
+ allowed to ignore blob prefixes if the delete marking was done
+ by some other trx as it must have committed by now for us to
+ allow an over-write. */
+ if (trx_id == trx->id) {
+ ignore_prefix = false;
+ }
+ ptr += mach_u64_write_compressed(ptr, trx_id);
+
+ field = rec_get_nth_field(rec, offsets, index->db_roll_ptr(), &flen);
+ ut_ad(flen == DATA_ROLL_PTR_LEN);
+ ut_ad(memcmp(field, field_ref_zero, DATA_ROLL_PTR_LEN));
+
+ ptr += mach_u64_write_compressed(ptr, trx_read_roll_ptr(field));
+
+ /*----------------------------------------*/
+ /* Store then the fields required to uniquely determine the
+ record which will be modified in the clustered index */
+
+ for (i = 0; i < dict_index_get_n_unique(index); i++) {
+
+ /* The ordering columns must not be instant added columns. */
+ ut_ad(!rec_offs_nth_default(offsets, i));
+ field = rec_get_nth_field(rec, offsets, i, &flen);
+
+ /* The ordering columns must not be stored externally. */
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ ut_ad(dict_index_get_nth_col(index, i)->ord_part);
+
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_block, ptr) < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+
+ /*----------------------------------------*/
+ /* Save to the undo log the old values of the columns to be updated. */
+
+ if (update) {
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ ulint n_updated = upd_get_n_fields(update);
+
+ /* If this is an online update while an inplace alter table
+		/* If this is an online update while an inplace ALTER TABLE
+		is in progress and the table has virtual columns, we will
+		need to double-check whether any non-indexed columns are
+		registered in the update vector, in case they will be indexed
+		in the new table. */
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ upd_field_t* fld = upd_get_nth_field(
+ update, i);
+ ulint pos = fld->field_no;
+
+ /* These columns must not have an index
+ on them */
+ if (upd_fld_is_virtual_col(fld)
+ && dict_table_get_nth_v_col(
+ table, pos)->v_indexes.empty()) {
+ n_updated--;
+ }
+ }
+ }
+
+ i = 0;
+
+ if (UNIV_UNLIKELY(update->is_alter_metadata())) {
+ ut_ad(update->n_fields >= 1);
+ ut_ad(!upd_fld_is_virtual_col(&update->fields[0]));
+ ut_ad(update->fields[0].field_no
+ == index->first_user_field());
+ ut_ad(!dfield_is_ext(&update->fields[0].new_val));
+ ut_ad(!dfield_is_null(&update->fields[0].new_val));
+ /* The instant ADD COLUMN metadata record does not
+ contain the BLOB. Do not write anything for it. */
+ i = !rec_is_alter_metadata(rec, *index);
+ n_updated -= i;
+ }
+
+ ptr += mach_write_compressed(ptr, n_updated);
+
+ for (; i < upd_get_n_fields(update); i++) {
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return 0;
+ }
+
+ upd_field_t* fld = upd_get_nth_field(update, i);
+
+ bool is_virtual = upd_fld_is_virtual_col(fld);
+ ulint max_v_log_len = 0;
+
+ ulint pos = fld->field_no;
+ const dict_col_t* col = NULL;
+
+ if (is_virtual) {
+ /* Skip the non-indexed column, during
+ an online alter table */
+ if (dict_index_is_online_ddl(index)
+ && dict_table_get_nth_v_col(
+ table, pos)->v_indexes.empty()) {
+ continue;
+ }
+
+ /* add REC_MAX_N_FIELDS to mark this
+ is a virtual col */
+ ptr += mach_write_compressed(
+ ptr, pos + REC_MAX_N_FIELDS);
+
+ if (trx_undo_left(undo_block, ptr) < 15) {
+ return 0;
+ }
+
+ ut_ad(fld->field_no < table->n_v_def);
+
+ ptr = trx_undo_log_v_idx(undo_block, table,
+ fld->field_no, ptr,
+ first_v_col);
+ if (ptr == NULL) {
+ return(0);
+ }
+ first_v_col = false;
+
+ max_v_log_len
+ = dict_max_v_field_len_store_undo(
+ table, fld->field_no);
+
+ field = static_cast<byte*>(
+ fld->old_v_val->data);
+ flen = fld->old_v_val->len;
+
+ /* Only log sufficient bytes for index
+ record update */
+ if (flen != UNIV_SQL_NULL) {
+ flen = ut_min(
+ flen, max_v_log_len);
+ }
+
+ goto store_len;
+ }
+
+ if (UNIV_UNLIKELY(update->is_metadata())) {
+ ut_ad(pos >= index->first_user_field());
+ ut_ad(rec_is_metadata(rec, *index));
+
+ if (rec_is_alter_metadata(rec, *index)) {
+ ut_ad(update->is_alter_metadata());
+
+ field = rec_offs_n_fields(offsets)
+ > pos
+ && !rec_offs_nth_default(
+ offsets, pos)
+ ? rec_get_nth_field(
+ rec, offsets,
+ pos, &flen)
+ : index->instant_field_value(
+ pos - 1, &flen);
+
+ if (pos == index->first_user_field()) {
+ ut_ad(rec_offs_nth_extern(
+ offsets, pos));
+ ut_ad(flen == FIELD_REF_SIZE);
+ goto write_field;
+ }
+ col = dict_index_get_nth_col(index,
+ pos - 1);
+ } else if (!update->is_alter_metadata()) {
+ goto get_field;
+ } else {
+ /* We are converting an ADD COLUMN
+ metadata record to an ALTER TABLE
+ metadata record, with BLOB. Subtract
+ the missing metadata BLOB field. */
+ ut_ad(pos > index->first_user_field());
+ --pos;
+ goto get_field;
+ }
+ } else {
+get_field:
+ col = dict_index_get_nth_col(index, pos);
+ field = rec_get_nth_cfield(
+ rec, index, offsets, pos, &flen);
+ }
+write_field:
+ /* Write field number to undo log */
+ ptr += mach_write_compressed(ptr, pos);
+
+ if (trx_undo_left(undo_block, ptr) < 15) {
+ return 0;
+ }
+
+ if (rec_offs_n_fields(offsets) > pos
+ && rec_offs_nth_extern(offsets, pos)) {
+ ut_ad(col || pos == index->first_user_field());
+ ut_ad(col || update->is_alter_metadata());
+ ut_ad(col
+ || rec_is_alter_metadata(rec, *index));
+ ulint prefix_len = col
+ ? dict_max_field_len_store_undo(
+ table, col)
+ : 0;
+
+ ut_ad(prefix_len + BTR_EXTERN_FIELD_REF_SIZE
+ <= sizeof ext_buf);
+
+ ptr = trx_undo_page_report_modify_ext(
+ ptr,
+ col
+ && col->ord_part
+ && !ignore_prefix
+ && flen < REC_ANTELOPE_MAX_INDEX_COL_LEN
+ ? ext_buf : NULL, prefix_len,
+ table->space->zip_size(),
+ &field, &flen, SPATIAL_UNKNOWN);
+
+ *type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN;
+ } else {
+store_len:
+ ptr += mach_write_compressed(ptr, flen);
+ }
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_block, ptr) < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+
+ /* Also record the new value for virtual column */
+ if (is_virtual) {
+ field = static_cast<byte*>(fld->new_val.data);
+ flen = fld->new_val.len;
+ if (flen != UNIV_SQL_NULL) {
+ flen = ut_min(
+ flen, max_v_log_len);
+ }
+
+ if (trx_undo_left(undo_block, ptr) < 15) {
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_block, ptr)
+ < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+ }
+ }
+
+ /* Reset the first_v_col, so to put the virtual column undo
+ version marker again, when we log all the indexed columns */
+ first_v_col = true;
+
+ /*----------------------------------------*/
+ /* In the case of a delete marking, and also in the case of an update
+ where any ordering field of any index changes, store the values of all
+ columns which occur as ordering fields in any index. This info is used
+ in the purge of old versions where we use it to build and search the
+ delete marked index records, to look if we can remove them from the
+ index tree. Note that starting from 4.0.14 also externally stored
+ fields can be ordering in some index. Starting from 5.2, we no longer
+ store REC_MAX_INDEX_COL_LEN first bytes to the undo log record,
+ but we can construct the column prefix fields in the index by
+ fetching the first page of the BLOB that is pointed to by the
+ clustered index. This works also in crash recovery, because all pages
+ (including BLOBs) are recovered before anything is rolled back. */
+
+ if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ byte* old_ptr = ptr;
+ double mbr[SPDIMS * 2];
+ mem_heap_t* row_heap = NULL;
+
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ /* Reserve 2 bytes to write the number of bytes the stored
+ fields take in this undo record */
+
+ ptr += 2;
+
+ for (col_no = 0; col_no < dict_table_get_n_cols(table);
+ col_no++) {
+
+ const dict_col_t* col
+ = dict_table_get_nth_col(table, col_no);
+
+ if (!col->ord_part) {
+ continue;
+ }
+
+ const ulint pos = dict_index_get_nth_col_pos(
+ index, col_no, NULL);
+ /* All non-virtual columns must be present in
+ the clustered index. */
+ ut_ad(pos != ULINT_UNDEFINED);
+
+ const bool is_ext = rec_offs_nth_extern(offsets, pos);
+ const spatial_status_t spatial_status = is_ext
+ ? dict_col_get_spatial_status(col)
+ : SPATIAL_NONE;
+
+ switch (spatial_status) {
+ case SPATIAL_UNKNOWN:
+ ut_ad(0);
+ /* fall through */
+ case SPATIAL_MIXED:
+ case SPATIAL_ONLY:
+ /* Externally stored spatially indexed
+ columns will be (redundantly) logged
+ again, because we did not write the
+ MBR yet, that is, the previous call to
+ trx_undo_page_report_modify_ext()
+ was with SPATIAL_UNKNOWN. */
+ break;
+ case SPATIAL_NONE:
+ if (!update) {
+ /* This is a DELETE operation. */
+ break;
+ }
+ /* Avoid redundantly logging indexed
+ columns that were updated. */
+
+ for (i = 0; i < update->n_fields; i++) {
+ const ulint field_no
+ = upd_get_nth_field(update, i)
+ ->field_no;
+ if (field_no >= index->n_fields
+ || dict_index_get_nth_field(
+ index, field_no)->col
+ == col) {
+ goto already_logged;
+ }
+ }
+ }
+
+ if (true) {
+ /* Write field number to undo log */
+ if (trx_undo_left(undo_block, ptr) < 5 + 15) {
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, pos);
+
+ /* Save the old value of field */
+ field = rec_get_nth_cfield(
+ rec, index, offsets, pos, &flen);
+
+ if (is_ext) {
+ const dict_col_t* col =
+ dict_index_get_nth_col(
+ index, pos);
+ ulint prefix_len =
+ dict_max_field_len_store_undo(
+ table, col);
+
+ ut_a(prefix_len < sizeof ext_buf);
+ const ulint zip_size
+ = table->space->zip_size();
+
+ /* If there is a spatial index on it,
+ log its MBR */
+ if (spatial_status != SPATIAL_NONE) {
+ ut_ad(DATA_GEOMETRY_MTYPE(
+ col->mtype));
+
+ trx_undo_get_mbr_from_ext(
+ mbr, zip_size,
+ field, &flen);
+ }
+
+ ptr = trx_undo_page_report_modify_ext(
+ ptr,
+ flen < REC_ANTELOPE_MAX_INDEX_COL_LEN
+ && !ignore_prefix
+ ? ext_buf : NULL, prefix_len,
+ zip_size,
+ &field, &flen,
+ spatial_status);
+ } else {
+ ptr += mach_write_compressed(
+ ptr, flen);
+ }
+
+ if (flen != UNIV_SQL_NULL
+ && spatial_status != SPATIAL_ONLY) {
+ if (trx_undo_left(undo_block, ptr)
+ < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+
+ if (spatial_status != SPATIAL_NONE) {
+ if (trx_undo_left(undo_block, ptr)
+ < DATA_MBR_LEN) {
+ return(0);
+ }
+
+ for (int i = 0; i < SPDIMS * 2;
+ i++) {
+ mach_double_write(
+ ptr, mbr[i]);
+ ptr += sizeof(double);
+ }
+ }
+ }
+
+already_logged:
+ continue;
+ }
+
+ for (col_no = 0; col_no < dict_table_get_n_v_cols(table);
+ col_no++) {
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(table, col_no);
+
+ if (col->m_col.ord_part) {
+ ulint pos = col_no;
+ ulint max_v_log_len
+ = dict_max_v_field_len_store_undo(
+ table, pos);
+
+				/* Write field number to undo log.
+				Make sure there is enough space in the log */
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ pos += REC_MAX_N_FIELDS;
+ ptr += mach_write_compressed(ptr, pos);
+
+ ut_ad(col_no < table->n_v_def);
+ ptr = trx_undo_log_v_idx(undo_block, table,
+ col_no, ptr,
+ first_v_col);
+ first_v_col = false;
+
+ if (!ptr) {
+ return(0);
+ }
+
+ const dfield_t* vfield = NULL;
+
+ if (update) {
+ ut_ad(!row);
+ if (update->old_vrow == NULL) {
+ flen = UNIV_SQL_NULL;
+ } else {
+ vfield = dtuple_get_nth_v_field(
+ update->old_vrow,
+ col->v_pos);
+ }
+ } else if (row) {
+ vfield = dtuple_get_nth_v_field(
+ row, col->v_pos);
+ } else {
+ ut_ad(0);
+ }
+
+ if (vfield) {
+ field = static_cast<byte*>(vfield->data);
+ flen = vfield->len;
+ } else {
+ ut_ad(flen == UNIV_SQL_NULL);
+ }
+
+ if (flen != UNIV_SQL_NULL) {
+ flen = ut_min(
+ flen, max_v_log_len);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ switch (flen) {
+ case 0: case UNIV_SQL_NULL:
+ break;
+ default:
+ if (trx_undo_left(undo_block, ptr)
+ < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+ }
+
+ mach_write_to_2(old_ptr, ulint(ptr - old_ptr));
+
+ if (row_heap) {
+ mem_heap_free(row_heap);
+ }
+ }
+
+ /*----------------------------------------*/
+ /* Write pointers to the previous and the next undo log records */
+ if (trx_undo_left(undo_block, ptr) < 2) {
+ return(0);
+ }
+
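+	/* The undo log record is framed by two 2-byte offsets: the two
+	bytes appended here at the end of the record point back to the
+	start of the record (first_free), while the two bytes at the
+	start of the record and the TRX_UNDO_PAGE_FREE field in the
+	page header are both set to the offset of the first byte after
+	this record (new_free). */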
+ mach_write_to_2(ptr, first_free);
+ const uint16_t new_free = static_cast<uint16_t>(
+ ptr + 2 - undo_block->page.frame);
+ mach_write_to_2(undo_block->page.frame + first_free, new_free);
+
+ mach_write_to_2(ptr_to_first_free, new_free);
+
+ const byte* start = &undo_block->page.frame[first_free + 2];
+ mtr->undo_append(*undo_block, start, ptr - start);
+ return(first_free);
+}
+
+/**********************************************************************//**
+Reads from an undo log update record the system field values of the old
+version.
+@return remaining part of undo log record after reading these values */
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+ const byte* ptr, /*!< in: remaining part of undo
+ log record after reading
+ general parameters */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr, /*!< out: roll ptr */
+ byte* info_bits) /*!< out: info bits state */
+{
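+	/* At this point of the undo log record the system columns are
+	stored as one byte of info bits, followed by DB_TRX_ID and
+	DB_ROLL_PTR in the variable-length compressed integer format
+	that mach_u64_read_next_compressed() decodes. */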
+ /* Read the state of the info bits */
+ *info_bits = *ptr++;
+
+ /* Read the values of the system columns */
+
+ *trx_id = mach_u64_read_next_compressed(&ptr);
+ *roll_ptr = mach_u64_read_next_compressed(&ptr);
+
+ return(const_cast<byte*>(ptr));
+}
+
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+@return remaining part of the record, NULL if an error is detected,
+which means that the record is corrupted */
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+ const byte* ptr, /*!< in: remaining part in update undo log
+ record, after reading the row reference
+ NOTE that this copy of the undo log record must
+ be preserved as long as the update vector is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC,
+ TRX_UNDO_UPD_DEL_REC, or
+ TRX_UNDO_DEL_MARK_REC; in the last case,
+ only trx id and roll ptr fields are added to
+ the update vector */
+ trx_id_t trx_id, /*!< in: transaction id from this undo record */
+ roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */
+ byte info_bits,/*!< in: info bits from this undo record */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ upd_t** upd) /*!< out, own: update vector */
+{
+ upd_field_t* upd_field;
+ upd_t* update;
+ ulint n_fields;
+ byte* buf;
+ bool first_v_col = true;
+ bool is_undo_log = true;
+ ulint n_skip_field = 0;
+
+ ut_a(dict_index_is_clust(index));
+
+ if (type != TRX_UNDO_DEL_MARK_REC) {
+ n_fields = mach_read_next_compressed(&ptr);
+ } else {
+ n_fields = 0;
+ }
+
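+	/* The update vector is allocated with two extra slots: the
+	DB_TRX_ID and DB_ROLL_PTR system columns are stored at
+	positions n_fields and n_fields + 1, after the user columns
+	that are parsed from the undo log record below. */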
+ *upd = update = upd_create(n_fields + 2, heap);
+
+ update->info_bits = info_bits;
+
+ /* Store first trx id and roll ptr to update vector */
+
+ upd_field = upd_get_nth_field(update, n_fields);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_TRX_ID_LEN));
+
+ mach_write_to_6(buf, trx_id);
+
+ upd_field_set_field_no(upd_field, index->db_trx_id(), index);
+ dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN);
+
+ upd_field = upd_get_nth_field(update, n_fields + 1);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_ROLL_PTR_LEN));
+
+ trx_write_roll_ptr(buf, roll_ptr);
+
+ upd_field_set_field_no(upd_field, index->db_roll_ptr(), index);
+ dfield_set_data(&(upd_field->new_val), buf, DATA_ROLL_PTR_LEN);
+
+	/* Then store the updated ordinary columns in the update vector */
+
+ for (ulint i = 0; i < n_fields; i++) {
+ const byte* field;
+ uint32_t len, orig_len;
+
+ upd_field = upd_get_nth_field(update, i);
+ uint32_t field_no = mach_read_next_compressed(&ptr);
+
+ const bool is_virtual = (field_no >= REC_MAX_N_FIELDS);
+
+ if (is_virtual) {
+			/* If this is a new-format record, we need to check
+			the index list to figure out the correct virtual
+			column position */
+ ptr = trx_undo_read_v_idx(
+ index->table, ptr, first_v_col, &is_undo_log,
+ &field_no);
+ first_v_col = false;
+			/* This column could have been dropped or is
+			no longer indexed */
+ if (field_no >= index->n_fields) {
+				/* Mark that this field is no longer needed */
+ upd_field->field_no = REC_MAX_N_FIELDS;
+
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+ n_skip_field++;
+ continue;
+ }
+
+ upd_field_set_v_field_no(
+ upd_field, static_cast<uint16_t>(field_no),
+ index);
+ } else if (UNIV_UNLIKELY((update->info_bits
+ & ~REC_INFO_DELETED_FLAG)
+ == REC_INFO_MIN_REC_FLAG)) {
+ ut_ad(type == TRX_UNDO_UPD_EXIST_REC);
+ const uint32_t uf = index->first_user_field();
+ ut_ad(field_no >= uf);
+
+ if (update->info_bits != REC_INFO_MIN_REC_FLAG) {
+ /* Generic instant ALTER TABLE */
+ if (field_no == uf) {
+ upd_field->new_val.type
+ .metadata_blob_init();
+ } else if (field_no >= index->n_fields) {
+ /* This is reachable during
+ purge if the table was emptied
+ and converted to the canonical
+ format on a later ALTER TABLE.
+ In this case,
+ row_purge_upd_exist_or_extern()
+ would only be interested in
+ freeing any BLOBs that were
+ updated, that is, the metadata
+ BLOB above. Other BLOBs in
+ the metadata record are never
+ updated; they are for the
+ initial DEFAULT values of the
+ instantly added columns, and
+ they will never change.
+
+ Note: if the table becomes
+ empty during ROLLBACK or is
+ empty during subsequent ALTER
+ TABLE, and btr_page_empty() is
+ called to re-create the root
+ page without the metadata
+ record, in that case we should
+ only free the latest version
+ of BLOBs in the record,
+ which purge would never touch. */
+ field_no = REC_MAX_N_FIELDS;
+ n_skip_field++;
+ } else {
+ dict_col_copy_type(
+ dict_index_get_nth_col(
+ index, field_no - 1),
+ &upd_field->new_val.type);
+ }
+ } else {
+ /* Instant ADD COLUMN...LAST */
+ dict_col_copy_type(
+ dict_index_get_nth_col(index,
+ field_no),
+ &upd_field->new_val.type);
+ }
+ upd_field->field_no = field_no
+ & dict_index_t::MAX_N_FIELDS;
+ } else if (field_no < index->n_fields) {
+ upd_field_set_field_no(upd_field,
+ static_cast<uint16_t>(field_no),
+ index);
+ } else {
+ ib::error() << "Trying to access update undo rec"
+ " field " << field_no
+ << " in index " << index->name
+ << " of table " << index->table->name
+ << " but index has only "
+ << dict_index_get_n_fields(index)
+ << " fields " << BUG_REPORT_MSG
+ << ". Run also CHECK TABLE "
+ << index->table->name << "."
+ " n_fields = " << n_fields << ", i = " << i;
+
+ ut_ad(0);
+ *upd = NULL;
+ return(NULL);
+ }
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ upd_field->orig_len = static_cast<uint16_t>(orig_len);
+
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_null(&upd_field->new_val);
+ } else if (len < UNIV_EXTERN_STORAGE_FIELD) {
+ dfield_set_data(&upd_field->new_val, field, len);
+ } else {
+ len -= UNIV_EXTERN_STORAGE_FIELD;
+
+ dfield_set_data(&upd_field->new_val, field, len);
+ dfield_set_ext(&upd_field->new_val);
+ }
+
+ ut_ad(update->info_bits != (REC_INFO_DELETED_FLAG
+ | REC_INFO_MIN_REC_FLAG)
+ || field_no != index->first_user_field()
+ || (upd_field->new_val.ext
+ && upd_field->new_val.len == FIELD_REF_SIZE));
+
+ if (is_virtual) {
+ upd_field->old_v_val = static_cast<dfield_t*>(
+ mem_heap_alloc(
+ heap, sizeof *upd_field->old_v_val));
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_null(upd_field->old_v_val);
+ } else if (len < UNIV_EXTERN_STORAGE_FIELD) {
+ dfield_set_data(
+ upd_field->old_v_val, field, len);
+ } else {
+ ut_ad(0);
+ }
+ }
+ }
+
+ /* We may have to skip dropped indexed virtual columns.
+ Also, we may have to trim the update vector of a metadata record
+ if dict_index_t::clear_instant_alter() was invoked on the table
+ later, and the number of fields no longer matches. */
+
+ if (n_skip_field) {
+ upd_field_t* d = upd_get_nth_field(update, 0);
+ const upd_field_t* const end = d + n_fields + 2;
+
+ for (const upd_field_t* s = d; s != end; s++) {
+ if (s->field_no != REC_MAX_N_FIELDS) {
+ *d++ = *s;
+ }
+ }
+
+ ut_ad(d + n_skip_field == end);
+ update->n_fields = d - upd_get_nth_field(update, 0);
+ }
+
+ return(const_cast<byte*>(ptr));
+}
+
+/** Report a RENAME TABLE operation.
+@param[in,out] trx transaction
+@param[in] table table that is being renamed
+@param[in,out] block undo page
+@param[in,out] mtr mini-transaction
+@return byte offset of the undo log record
+@retval 0 in case of failure */
+static
+uint16_t
+trx_undo_page_report_rename(trx_t* trx, const dict_table_t* table,
+ buf_block_t* block, mtr_t* mtr)
+{
+ byte* ptr_first_free = my_assume_aligned<2>(TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + block->page.frame);
+ const uint16_t first_free = mach_read_from_2(ptr_first_free);
+ ut_ad(first_free >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ ut_ad(first_free <= srv_page_size - FIL_PAGE_DATA_END);
+ byte* const start = block->page.frame + first_free;
+ size_t len = strlen(table->name.m_name);
+ const size_t fixed = 2 + 1 + 11 + 11 + 2;
+ ut_ad(len <= NAME_CHAR_LEN * 5 * 2 + 1);
+ /* The -10 is used in trx_undo_left() */
+ compile_time_assert(NAME_CHAR_LEN * 5 * 2 + fixed
+ + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE
+ < UNIV_PAGE_SIZE_MIN - 10 - FIL_PAGE_DATA_END);
+
+ if (trx_undo_left(block, start) < fixed + len) {
+ ut_ad(first_free > TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_HDR_SIZE);
+ return 0;
+ }
+
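+	/* Layout of a TRX_UNDO_RENAME_TABLE record: 2 bytes that will
+	point to the next record (the new free offset), 1 byte of type,
+	the undo number and the table id in much-compressed format, the
+	old table name, and finally 2 bytes pointing back to the start
+	of this record; this matches the "fixed" size computed above. */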
+ byte* ptr = start + 2;
+ *ptr++ = TRX_UNDO_RENAME_TABLE;
+ ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
+ ptr += mach_u64_write_much_compressed(ptr, table->id);
+ memcpy(ptr, table->name.m_name, len);
+ ptr += len;
+ mach_write_to_2(ptr, first_free);
+ mach_write_to_2(ptr_first_free, ptr + 2 - block->page.frame);
+ memcpy(start, ptr_first_free, 2);
+ mtr->undo_append(*block, start + 2, ptr - start - 2);
+ return first_free;
+}
+
+/** Report a RENAME TABLE operation.
+@param[in,out] trx transaction
+@param[in] table table that is being renamed
+@return DB_SUCCESS or error code */
+dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table)
+{
+ ut_ad(!trx->read_only);
+ ut_ad(trx->id);
+ ut_ad(!table->is_temporary());
+
+ mtr_t mtr;
+ dberr_t err;
+ mtr.start();
+ if (buf_block_t* block = trx_undo_assign(trx, &err, &mtr)) {
+ trx_undo_t* undo = trx->rsegs.m_redo.undo;
+ ut_ad(err == DB_SUCCESS);
+ ut_ad(undo);
+ for (ut_d(int loop_count = 0);;) {
+ ut_ad(loop_count++ < 2);
+ ut_ad(undo->last_page_no
+ == block->page.id().page_no());
+
+ if (uint16_t offset = trx_undo_page_report_rename(
+ trx, table, block, &mtr)) {
+ undo->top_page_no = undo->last_page_no;
+ undo->top_offset = offset;
+ undo->top_undo_no = trx->undo_no++;
+ undo->guess_block = block;
+ ut_ad(!undo->empty());
+
+ err = DB_SUCCESS;
+ break;
+ } else {
+ mtr.commit();
+ mtr.start();
+ block = trx_undo_add_page(undo, &mtr, &err);
+ if (!block) {
+ break;
+ }
+ }
+ }
+ }
+
+ mtr.commit();
+ return err;
+}
+
+TRANSACTIONAL_TARGET ATTRIBUTE_NOINLINE
+/** @return whether the transaction holds an exclusive lock on a table */
+static bool trx_has_lock_x(const trx_t &trx, dict_table_t& table)
+{
+ if (table.is_temporary())
+ return true;
+
+ uint32_t n;
+
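+  /* Read table.n_lock_x_or_s either inside a hardware memory
+  transaction (lock elision via xbegin()/xend()), or under the table
+  lock mutex if elision is unavailable or the transaction aborts. */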
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (table.lock_mutex_is_locked())
+ xabort();
+ n= table.n_lock_x_or_s;
+ xend();
+ }
+ else
+#endif
+ {
+ table.lock_mutex_lock();
+ n= table.n_lock_x_or_s;
+ table.lock_mutex_unlock();
+ }
+
+ /* This thread is executing trx. No other thread can modify our table locks
+ (only record locks might be created, in an implicit-to-explicit conversion).
+ Hence, no mutex is needed here. */
+ if (n)
+ for (const lock_t *lock : trx.lock.table_locks)
+ if (lock && lock->type_mode == (LOCK_X | LOCK_TABLE))
+ return true;
+
+ return false;
+}
+
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction.
+@return DB_SUCCESS or error code */
+dberr_t
+trx_undo_report_row_operation(
+/*==========================*/
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: in the case of an insert,
+ index entry to insert into the
+ clustered index; in updates,
+ may contain a clustered index
+ record tuple that also contains
+ virtual columns of the table;
+ otherwise, NULL */
+ const upd_t* update, /*!< in: in the case of an update,
+ the update vector, otherwise NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const rec_t* rec, /*!< in: case of an update or delete
+ marking, the record in the clustered
+ index; NULL if insert */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */
+ roll_ptr_t* roll_ptr) /*!< out: DB_ROLL_PTR to the
+ undo log record */
+{
+ trx_t* trx;
+#ifdef UNIV_DEBUG
+ int loop_count = 0;
+#endif /* UNIV_DEBUG */
+
+ ut_a(dict_index_is_clust(index));
+ ut_ad(!update || rec);
+ ut_ad(!rec || rec_offs_validate(rec, index, offsets));
+ ut_ad(!srv_read_only_mode);
+
+ trx = thr_get_trx(thr);
+ /* This function must not be invoked during rollback
+ (of a TRX_STATE_PREPARE transaction or otherwise). */
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_ad(!trx->in_rollback);
+
+	/* We must determine whether this is the first time this
+	transaction modifies this table. */
+ auto m = trx->mod_tables.emplace(index->table, trx->undo_no);
+ ut_ad(m.first->second.valid(trx->undo_no));
+
+ if (m.second && index->table->is_active_ddl()) {
+ trx->apply_online_log= true;
+ }
+
+ bool bulk = !rec;
+
+ if (!bulk) {
+ /* An UPDATE or DELETE must not be covered by an
+ earlier start_bulk_insert(). */
+ ut_ad(!m.first->second.is_bulk_insert());
+ } else if (m.first->second.is_bulk_insert()) {
+ /* Above, the emplace() tried to insert an object with
+ !is_bulk_insert(). Only an explicit start_bulk_insert()
+ (below) can set the flag. */
+ ut_ad(!m.second);
+ /* We already wrote a TRX_UNDO_EMPTY record. */
+ ut_ad(thr->run_node);
+ ut_ad(que_node_get_type(thr->run_node) == QUE_NODE_INSERT);
+ ut_ad(trx->bulk_insert);
+ return DB_SUCCESS;
+ } else if (m.second && trx->bulk_insert
+ && trx_has_lock_x(*trx, *index->table)) {
+ m.first->second.start_bulk_insert();
+ } else {
+ bulk = false;
+ }
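+	/* Here, bulk remains true only if this is the first modification
+	of the table by this transaction, bulk insert was requested, and
+	the transaction holds an exclusive lock on the table. In that
+	case the insert below is logged as a single table-level
+	TRX_UNDO_EMPTY record, and subsequent inserts into the same
+	table return early in the is_bulk_insert() branch above. */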
+
+ mtr_t mtr;
+ mtr.start();
+ trx_undo_t** pundo;
+ trx_rseg_t* rseg;
+ const bool is_temp = index->table->is_temporary();
+
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ rseg = trx->get_temp_rseg();
+ pundo = &trx->rsegs.m_noredo.undo;
+ } else {
+ ut_ad(!trx->read_only);
+ ut_ad(trx->id);
+ pundo = &trx->rsegs.m_redo.undo;
+ rseg = trx->rsegs.m_redo.rseg;
+ }
+
+ dberr_t err;
+ buf_block_t* undo_block = trx_undo_assign_low(trx, rseg, pundo,
+ &err, &mtr);
+ trx_undo_t* undo = *pundo;
+ ut_ad((err == DB_SUCCESS) == (undo_block != NULL));
+ if (UNIV_UNLIKELY(undo_block == NULL)) {
+err_exit:
+ mtr.commit();
+ return err;
+ }
+
+ ut_ad(undo != NULL);
+
+ do {
+ uint16_t offset = !rec
+ ? trx_undo_page_report_insert(
+ undo_block, trx, index, clust_entry, &mtr,
+ bulk)
+ : trx_undo_page_report_modify(
+ undo_block, trx, index, rec, offsets, update,
+ cmpl_info, clust_entry, &mtr);
+
+ if (UNIV_UNLIKELY(offset == 0)) {
+ const uint16_t first_free = mach_read_from_2(
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + undo_block->page.frame);
+ memset(undo_block->page.frame + first_free, 0,
+ (srv_page_size - FIL_PAGE_DATA_END)
+ - first_free);
+
+ if (first_free
+ == TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE) {
+ /* The record did not fit on an empty
+ undo page. Discard the freshly allocated
+ page and return an error. */
+
+ /* When we remove a page from an undo
+ log, this is analogous to a
+ pessimistic insert in a B-tree, and we
+ must reserve the counterpart of the
+ tree latch, which is the rseg
+ mutex. We must commit the mini-transaction
+ first, because it may be holding lower-level
+ latches, such as SYNC_FSP_PAGE. */
+
+ mtr.commit();
+ mtr.start();
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
+ err = trx_undo_free_last_page(undo, &mtr);
+ rseg->latch.wr_unlock();
+
+ if (m.second) {
+ /* We are not going to modify
+ this table after all. */
+ trx->mod_tables.erase(m.first);
+ }
+
+ if (err == DB_SUCCESS) {
+ err = DB_UNDO_RECORD_TOO_BIG;
+ }
+ goto err_exit;
+ } else {
+ /* Write log for clearing the unused
+ tail of the undo page. It might
+ contain some garbage from a previously
+ written record, and mtr_t::write()
+ will optimize away writes of unchanged
+ bytes. Failure to write this caused a
+ recovery failure when we avoided
+ reading the undo log page from the
+ data file and initialized it based on
+ redo log records (which included the
+ write of the previous garbage). */
+ mtr.memset(*undo_block, first_free,
+ srv_page_size - first_free
+ - FIL_PAGE_DATA_END, 0);
+ }
+
+ mtr.commit();
+ } else {
+ /* Success */
+ undo->top_page_no = undo_block->page.id().page_no();
+ mtr.commit();
+ undo->top_offset = offset;
+ undo->top_undo_no = trx->undo_no++;
+ undo->guess_block = undo_block;
+ ut_ad(!undo->empty());
+
+ if (!is_temp) {
+ trx_mod_table_time_t& time = m.first->second;
+ ut_ad(time.valid(undo->top_undo_no));
+
+ if (!time.is_versioned()
+ && index->table->versioned_by_id()
+ && (!rec /* INSERT */
+ || (update
+ && update->affects_versioned()))) {
+ time.set_versioned(undo->top_undo_no);
+ }
+ }
+
+ if (!bulk) {
+ *roll_ptr = trx_undo_build_roll_ptr(
+ !rec, trx_sys.rseg_id(rseg, !is_temp),
+ undo->top_page_no, offset);
+ }
+
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(undo_block->page.id().page_no() == undo->last_page_no);
+
+ /* We have to extend the undo log by one page */
+
+ ut_ad(++loop_count < 2);
+ mtr.start();
+
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ undo_block = trx_undo_add_page(undo, &mtr, &err);
+
+ DBUG_EXECUTE_IF("ib_err_ins_undo_page_add_failure",
+ undo_block = NULL;);
+ } while (UNIV_LIKELY(undo_block != NULL));
+
+ if (err != DB_OUT_OF_FILE_SPACE) {
+ goto err_exit;
+ }
+
+ ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ DB_OUT_OF_FILE_SPACE,
+ //ER_INNODB_UNDO_LOG_FULL,
+ "No more space left over in %s tablespace for allocating UNDO"
+ " log pages. Please add new data file to the tablespace or"
+ " check if filesystem is full or enable auto-extension for"
+ " the tablespace",
+ undo->rseg->space == fil_system.sys_space
+ ? "system" : is_temp ? "temporary" : "undo");
+
+ goto err_exit;
+}
+
+/*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/
+
+/** Copy an undo record to heap.
+@param[in] roll_ptr roll pointer to a record that exists
+@param[in,out] heap memory heap where copied */
+static
+trx_undo_rec_t*
+trx_undo_get_undo_rec_low(
+ roll_ptr_t roll_ptr,
+ mem_heap_t* heap)
+{
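+	/* A DB_ROLL_PTR is a 7-byte value consisting of a 1-bit
+	"is insert" flag, a 7-bit rollback segment id, a 32-bit undo
+	log page number and a 16-bit byte offset within that page.
+	trx_undo_decode_roll_ptr() splits it into these components so
+	that the record can be located and copied to the caller's heap. */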
+ ulint rseg_id;
+ uint32_t page_no;
+ uint16_t offset;
+ bool is_insert;
+ mtr_t mtr;
+
+ trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no, &offset);
+ ut_ad(page_no > FSP_FIRST_INODE_PAGE_NO);
+ ut_ad(offset >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ trx_rseg_t *rseg= &trx_sys.rseg_array[rseg_id];
+ ut_ad(rseg->is_persistent());
+
+ mtr.start();
+
+ const buf_block_t* undo_page=
+ buf_page_get(page_id_t(rseg->space->id, page_no), 0, RW_S_LATCH, &mtr);
+
+ trx_undo_rec_t *undo_rec= undo_page
+ ? trx_undo_rec_copy(undo_page->page.frame + offset, heap)
+ : nullptr;
+
+ mtr.commit();
+ return undo_rec;
+}
+
+/** Copy an undo record to heap, to check if a secondary index record
+can be safely purged.
+@param trx_id DB_TRX_ID corresponding to roll_ptr
+@param name table name
+@param roll_ptr DB_ROLL_PTR pointing to the undo log record
+@param heap memory heap for allocation
+@return copy of the record
+@retval nullptr if the version is visible to purge_sys.view */
+static trx_undo_rec_t *trx_undo_get_rec_if_purgeable(trx_id_t trx_id,
+ const table_name_t &name,
+ roll_ptr_t roll_ptr,
+ mem_heap_t* heap)
+{
+ {
+ purge_sys_t::view_guard check;
+ if (!check.view().changes_visible(trx_id))
+ return trx_undo_get_undo_rec_low(roll_ptr, heap);
+ }
+ return nullptr;
+}
+
+/** Copy an undo record to heap.
+@param trx_id DB_TRX_ID corresponding to roll_ptr
+@param name table name
+@param roll_ptr DB_ROLL_PTR pointing to the undo log record
+@param heap memory heap for allocation
+@return copy of the record
+@retval nullptr if the undo log is not available */
+static trx_undo_rec_t *trx_undo_get_undo_rec(trx_id_t trx_id,
+ const table_name_t &name,
+ roll_ptr_t roll_ptr,
+ mem_heap_t *heap)
+{
+ {
+ purge_sys_t::end_view_guard check;
+ if (!check.view().changes_visible(trx_id))
+ return trx_undo_get_undo_rec_low(roll_ptr, heap);
+ }
+ return nullptr;
+}
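+/* The two lookup functions above differ only in the view that is
+consulted: trx_undo_get_rec_if_purgeable() checks purge_sys.view to
+see whether the version is already visible to purge, while
+trx_undo_get_undo_rec() checks the purge end view, which tells
+whether the undo log record may no longer be available. */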
+
+/** Build a previous version of a clustered index record. The caller
+must hold a latch on the index page of the clustered index record.
+@param rec version of a clustered index record
+@param index clustered index
+@param offsets rec_get_offsets(rec, index)
+@param heap memory heap from which the memory needed is
+ allocated
+@param old_vers previous version or NULL if rec is the
+ first inserted version, or if history data
+ has been deleted (an error), or if the purge
+ could have removed the version
+ though it has not yet done so
+@param v_heap memory heap used to create vrow
+ dtuple if it is not yet created. This heap
+ differs from "heap" above in that it could be
+ prebuilt->old_vers_heap for selection
+@param vrow virtual column info, if any
+@param v_status status that tells whether this function is
+ invoked by the purge thread, and whether we
+ read the "after image" of the undo log
+@return error code
+@retval DB_SUCCESS if previous version was successfully built,
+or if it was an insert or the undo record refers to the table before rebuild
+@retval DB_MISSING_HISTORY if the history is missing */
+TRANSACTIONAL_TARGET
+dberr_t
+trx_undo_prev_version_build(
+ const rec_t *rec,
+ dict_index_t *index,
+ rec_offs *offsets,
+ mem_heap_t *heap,
+ rec_t **old_vers,
+ mem_heap_t *v_heap,
+ dtuple_t **vrow,
+ ulint v_status)
+{
+ dtuple_t* entry;
+ trx_id_t rec_trx_id;
+ ulint type;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ upd_t* update;
+ byte info_bits;
+ ulint cmpl_info;
+ bool dummy_extern;
+ byte* buf;
+
+ ut_ad(!index->table->is_temporary());
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ roll_ptr = row_get_rec_roll_ptr(rec, index, offsets);
+
+ *old_vers = NULL;
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+ /* The record rec is the first inserted version */
+ return DB_SUCCESS;
+ }
+
+ rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ ut_ad(!index->table->skip_alter_undo);
+
+ trx_undo_rec_t* undo_rec = v_status == TRX_UNDO_CHECK_PURGEABILITY
+ ? trx_undo_get_rec_if_purgeable(rec_trx_id, index->table->name,
+ roll_ptr, heap)
+ : trx_undo_get_undo_rec(rec_trx_id, index->table->name,
+ roll_ptr, heap);
+ if (!undo_rec) {
+ return DB_MISSING_HISTORY;
+ }
+
+ const byte *ptr =
+ trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
+
+ if (table_id != index->table->id) {
+ /* The table should have been rebuilt, but purge has
+ not yet removed the undo log records for the
+ now-dropped old table (table_id). */
+ return DB_SUCCESS;
+ }
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+
+ /* (a) If a clustered index record version is such that the
+ trx id stamp in it is bigger than purge_sys.view, then the
+ BLOBs in that version are known to exist (the purge has not
+ progressed that far);
+
+ (b) if the version is the first version such that trx id in it
+ is less than purge_sys.view, and it is not delete-marked,
+ then the BLOBs in that version are known to exist (the purge
+ cannot have purged the BLOBs referenced by that version
+ yet).
+
+	This function does not fetch any BLOBs. The callers might do so,
+	possibly by invoking row_ext_create() via row_build(). However,
+ they should have all needed information in the *old_vers
+ returned by this function. This is because *old_vers is based
+ on the transaction undo log records. The function
+ trx_undo_page_fetch_ext() will write BLOB prefixes to the
+ transaction undo log that are at least as long as the longest
+ possible column prefix in a secondary index. Thus, secondary
+ index entries for *old_vers can be constructed without
+ dereferencing any BLOB pointers. */
+
+ ptr = trx_undo_rec_skip_row_ref(ptr, index);
+
+ ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id,
+ roll_ptr, info_bits,
+ heap, &update);
+ ut_a(ptr);
+
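+	/* If applying the update would change the stored size of any
+	field or its external storage status, the previous version must
+	be rebuilt from an index entry; otherwise the record is copied
+	as is and the updated fields are patched in place below. */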
+ if (row_upd_changes_field_size_or_external(index, offsets, update)) {
+ /* We should confirm the existence of disowned external data,
+ if the previous version record is delete marked. If the trx_id
+ of the previous record is seen by purge view, we should treat
+ it as missing history, because the disowned external data
+ might be purged already.
+
+ The inherited external data (BLOBs) can be freed (purged)
+ after trx_id was committed, provided that no view was started
+ before trx_id. If the purge view can see the committed
+ delete-marked record by trx_id, no transactions need to access
+ the BLOB. */
+
+ if (update->info_bits & REC_INFO_DELETED_FLAG
+ && purge_sys.is_purgeable(trx_id)) {
+ return DB_SUCCESS;
+ }
+
+ /* We have to set the appropriate extern storage bits in the
+ old version of the record: the extern bits in rec for those
+ fields that update does NOT update, as well as the bits for
+ those fields that update updates to become externally stored
+ fields. Store the info: */
+
+ entry = row_rec_to_index_entry(rec, index, offsets, heap);
+ /* The page containing the clustered index record
+ corresponding to entry is latched in mtr. Thus the
+ following call is safe. */
+ if (!row_upd_index_replace_new_col_vals(entry, *index, update,
+ heap)) {
+ return (v_status & TRX_UNDO_PREV_IN_PURGE)
+ ? DB_MISSING_HISTORY : DB_CORRUPTION;
+ }
+
+ /* Get number of externally stored columns in updated record */
+ const ulint n_ext = index->is_primary()
+ ? dtuple_get_n_ext(entry) : 0;
+
+ buf = static_cast<byte*>(mem_heap_alloc(
+ heap, rec_get_converted_size(index, entry, n_ext)));
+
+ *old_vers = rec_convert_dtuple_to_rec(buf, index,
+ entry, n_ext);
+ } else {
+ buf = static_cast<byte*>(mem_heap_alloc(
+ heap, rec_offs_size(offsets)));
+
+ *old_vers = rec_copy(buf, rec, offsets);
+ rec_offs_make_valid(*old_vers, index, true, offsets);
+ rec_set_bit_field_1(*old_vers, update->info_bits,
+ rec_offs_comp(offsets)
+ ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+ for (ulint i = 0; i < update->n_fields; i++) {
+ const upd_field_t* uf = upd_get_nth_field(update, i);
+ if (upd_fld_is_virtual_col(uf)) {
+ /* There are no virtual columns in
+ a clustered index record. */
+ continue;
+ }
+ const ulint n = uf->field_no;
+ ut_ad(!dfield_is_ext(&uf->new_val)
+ == !rec_offs_nth_extern(offsets, n));
+ ut_ad(!rec_offs_nth_default(offsets, n));
+
+ if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) {
+ if (rec_offs_nth_sql_null(offsets, n)) {
+ ut_ad(index->table->is_instant());
+ ut_ad(n >= index->n_core_fields);
+ continue;
+ }
+ ut_ad(!index->table->not_redundant());
+ ulint l = rec_get_1byte_offs_flag(*old_vers)
+ ? (n + 1) : (n + 1) * 2;
+ byte* b = *old_vers - REC_N_OLD_EXTRA_BYTES
+ - l;
+ *b= byte(*b | REC_1BYTE_SQL_NULL_MASK);
+ compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
+ == REC_2BYTE_SQL_NULL_MASK);
+ continue;
+ }
+
+ ulint len;
+ memcpy(rec_get_nth_field(*old_vers, offsets, n, &len),
+ uf->new_val.data, uf->new_val.len);
+ if (UNIV_UNLIKELY(len != uf->new_val.len)) {
+ ut_ad(len == UNIV_SQL_NULL);
+ ut_ad(!rec_offs_comp(offsets));
+ ut_ad(uf->new_val.len
+ == rec_get_nth_field_size(rec, n));
+ ulint l = rec_get_1byte_offs_flag(*old_vers)
+ ? (n + 1) : (n + 1) * 2;
+ *(*old_vers - REC_N_OLD_EXTRA_BYTES - l)
+ &= byte(~REC_1BYTE_SQL_NULL_MASK);
+ }
+ }
+ }
+
+	/* Set the old value (which is the after image of an update) from
+	the update vector into the dtuple vrow */
+ if (v_status & TRX_UNDO_GET_OLD_V_VALUE) {
+ row_upd_replace_vcol((dtuple_t*)*vrow, index->table, update,
+ false, NULL, NULL);
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ rec_offs offsets_dbg[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_dbg);
+ ut_a(!rec_offs_any_null_extern(
+ *old_vers, rec_get_offsets(*old_vers, index, offsets_dbg,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap)));
+#endif // defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+
+ if (vrow && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ if (!(*vrow)) {
+ *vrow = dtuple_create_with_vcol(
+ v_heap ? v_heap : heap,
+ dict_table_get_n_cols(index->table),
+ dict_table_get_n_v_cols(index->table));
+ dtuple_init_v_fld(*vrow);
+ }
+
+ ut_ad(index->table->n_v_cols);
+ trx_undo_read_v_cols(index->table, ptr, *vrow,
+ v_status & TRX_UNDO_PREV_IN_PURGE);
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Read virtual column value from undo log
+@param[in] table the table
+@param[in] ptr undo log pointer
+@param[in,out] row the dtuple to fill
+@param[in] in_purge whether this is called by purge */
+void
+trx_undo_read_v_cols(
+ const dict_table_t* table,
+ const byte* ptr,
+ dtuple_t* row,
+ bool in_purge)
+{
+ const byte* end_ptr;
+ bool first_v_col = true;
+ bool is_undo_log = true;
+
+ end_ptr = ptr + mach_read_from_2(ptr);
+ ptr += 2;
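+	/* The first two bytes store the total length of this part of
+	the undo log record; end_ptr marks where the stored virtual
+	column values end. */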
+ while (ptr < end_ptr) {
+ dfield_t* dfield;
+ const byte* field;
+ uint32_t field_no, len, orig_len;
+
+ field_no = mach_read_next_compressed(
+ const_cast<const byte**>(&ptr));
+
+ const bool is_virtual = (field_no >= REC_MAX_N_FIELDS);
+
+ if (is_virtual) {
+ ptr = trx_undo_read_v_idx(
+ table, ptr, first_v_col, &is_undo_log,
+ &field_no);
+ first_v_col = false;
+ }
+
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+
+		/* The virtual column is no longer indexed or does not exist.
+		This check has to come after trx_undo_rec_get_col_val() so
+		that the undo ptr advances */
+ if (field_no == FIL_NULL) {
+ ut_ad(is_virtual);
+ continue;
+ }
+
+ if (is_virtual) {
+ dict_v_col_t* vcol = dict_table_get_nth_v_col(
+ table, field_no);
+
+ dfield = dtuple_get_nth_v_field(row, vcol->v_pos);
+
+ if (!in_purge
+ || dfield_get_type(dfield)->mtype == DATA_MISSING) {
+ dict_col_copy_type(
+ &vcol->m_col,
+ dfield_get_type(dfield));
+ dfield_set_data(dfield, field, len);
+ }
+ }
+ }
+
+ ut_ad(ptr == end_ptr);
+}
diff --git a/storage/innobase/trx/trx0roll.cc b/storage/innobase/trx/trx0roll.cc
new file mode 100644
index 00000000..59c9a319
--- /dev/null
+++ b/storage/innobase/trx/trx0roll.cc
@@ -0,0 +1,927 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0roll.cc
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0roll.h"
+
+#include <my_service_manager.h>
+#include <mysql/service_wsrep.h>
+
+#include "fsp0fsp.h"
+#include "lock0lock.h"
+#include "mach0data.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "row0mysql.h"
+#include "row0undo.h"
+#include "srv0mon.h"
+#include "srv0start.h"
+#include "trx0rec.h"
+#include "trx0rseg.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "trx0undo.h"
+
+#ifdef UNIV_PFS_THREAD
+mysql_pfs_key_t trx_rollback_clean_thread_key;
+#endif
+
+/** true if trx_rollback_all_recovered() thread is active */
+bool trx_rollback_is_active;
+
+/** In crash recovery, the current trx to be rolled back; NULL otherwise */
+const trx_t* trx_roll_crash_recv_trx;
+
+/** Finish transaction rollback.
+@return whether the rollback was completed normally
+@retval false if the rollback was aborted by shutdown */
+inline bool trx_t::rollback_finish()
+{
+ mod_tables.clear();
+ apply_online_log= false;
+ if (UNIV_LIKELY(error_state == DB_SUCCESS))
+ {
+ commit();
+ return true;
+ }
+
+ ut_a(error_state == DB_INTERRUPTED);
+ ut_ad(srv_shutdown_state != SRV_SHUTDOWN_NONE);
+ ut_a(!srv_undo_sources);
+ ut_ad(srv_fast_shutdown);
+ ut_d(in_rollback= false);
+ if (trx_undo_t *&undo= rsegs.m_redo.undo)
+ {
+ UT_LIST_REMOVE(rsegs.m_redo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo= nullptr;
+ }
+ if (trx_undo_t *&undo= rsegs.m_noredo.undo)
+ {
+ UT_LIST_REMOVE(rsegs.m_noredo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo= nullptr;
+ }
+ commit_low();
+ commit_cleanup();
+ return false;
+}
+
+/** Roll back an active transaction. */
+inline void trx_t::rollback_low(trx_savept_t *savept)
+{
+ mem_heap_t *heap= mem_heap_create(512);
+ roll_node_t *roll_node= roll_node_create(heap);
+ roll_node->savept= savept;
+
+ ut_ad(!in_rollback);
+#ifdef UNIV_DEBUG
+ {
+ const auto s= state;
+ ut_ad(s == TRX_STATE_ACTIVE ||
+ s == TRX_STATE_PREPARED ||
+ s == TRX_STATE_PREPARED_RECOVERED);
+ if (savept)
+ {
+ ut_ad(s == TRX_STATE_ACTIVE);
+ ut_ad(mysql_thd);
+ ut_ad(!is_recovered);
+ }
+ }
+#endif
+
+ error_state = DB_SUCCESS;
+
+ if (has_logged())
+ {
+ ut_ad(rsegs.m_redo.rseg || rsegs.m_noredo.rseg);
+ que_thr_t *thr= pars_complete_graph_for_exec(roll_node, this, heap,
+ nullptr);
+ ut_a(thr == que_fork_start_command(static_cast<que_fork_t*>
+ (que_node_get_parent(thr))));
+ que_run_threads(thr);
+ que_run_threads(roll_node->undo_thr);
+
+ /* Free the memory reserved by the undo graph. */
+ que_graph_free(static_cast<que_t*>(roll_node->undo_thr->common.parent));
+ }
+
+ if (!savept)
+ {
+ rollback_finish();
+ MONITOR_INC(MONITOR_TRX_ROLLBACK);
+ }
+ else
+ {
+    /* There must not be a partial rollback if the transaction was chosen as a
+    deadlock victim. Galera transaction abort can be invoked during partial
+    rollback. */
+ ut_ad(!(lock.was_chosen_as_deadlock_victim & 1));
+ ut_a(error_state == DB_SUCCESS);
+ const undo_no_t limit= savept->least_undo_no;
+ apply_online_log= false;
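+    /* For a partial rollback, erase from mod_tables every table whose
+    modifications by this transaction will be entirely undone by
+    rolling back to the savepoint, and re-evaluate whether any
+    remaining table still needs the online ALTER TABLE log applied. */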
+ for (trx_mod_tables_t::iterator i= mod_tables.begin();
+ i != mod_tables.end(); )
+ {
+ trx_mod_tables_t::iterator j= i++;
+ ut_ad(j->second.valid());
+ if (j->second.rollback(limit))
+ mod_tables.erase(j);
+ else if (!apply_online_log)
+ apply_online_log= j->first->is_active_ddl();
+ }
+ MONITOR_INC(MONITOR_TRX_ROLLBACK_SAVEPOINT);
+ }
+
+ mem_heap_free(heap);
+}
+
+/** Initiate rollback.
+@param savept savepoint
+@return error code or DB_SUCCESS */
+dberr_t trx_t::rollback(trx_savept_t *savept)
+{
+ ut_ad(!mutex_is_owner());
+ if (state == TRX_STATE_NOT_STARTED)
+ {
+ error_state= DB_SUCCESS;
+ return DB_SUCCESS;
+ }
+ ut_ad(state == TRX_STATE_ACTIVE);
+#ifdef WITH_WSREP
+ if (!savept && is_wsrep() && wsrep_thd_is_SR(mysql_thd))
+ wsrep_handle_SR_rollback(nullptr, mysql_thd);
+#endif /* WITH_WSREP */
+ rollback_low(savept);
+ return error_state;
+}
+
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+static
+dberr_t
+trx_rollback_for_mysql_low(
+/*=======================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ trx->op_info = "rollback";
+
+ /* If we are doing the XA recovery of prepared transactions,
+ then the transaction object does not have an InnoDB session
+ object, and we set a dummy session that we use for all MySQL
+ transactions. */
+
+ trx->rollback_low();
+
+ trx->op_info = "";
+
+ return(trx->error_state);
+}
+
+/** Rollback a transaction used in MySQL
+@param[in, out] trx transaction
+@return error code or DB_SUCCESS */
+dberr_t trx_rollback_for_mysql(trx_t* trx)
+{
+ /* We are reading trx->state without holding trx->mutex
+ here, because the rollback should be invoked for a running
+ active MySQL transaction (or recovered prepared transaction)
+ that is associated with the current thread. */
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx->will_lock = false;
+ ut_ad(trx->mysql_thd);
+		/* Galera transaction abort can be invoked from MDL acquisition
+ code, so trx->lock.was_chosen_as_deadlock_victim can be set
+ even if trx->state is TRX_STATE_NOT_STARTED. */
+ ut_ad(!(trx->lock.was_chosen_as_deadlock_victim & 1));
+#ifdef WITH_WSREP
+ trx->wsrep= false;
+ trx->lock.was_chosen_as_deadlock_victim= false;
+#endif
+ return(DB_SUCCESS);
+
+ case TRX_STATE_ACTIVE:
+ ut_ad(trx->mysql_thd);
+ ut_ad(!trx->is_recovered);
+ ut_ad(!trx->is_autocommit_non_locking() || trx->read_only);
+ return(trx_rollback_for_mysql_low(trx));
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ ut_ad(!trx->is_autocommit_non_locking());
+ if (trx->rsegs.m_redo.undo) {
+ /* The XA ROLLBACK of a XA PREPARE transaction
+ will consist of multiple mini-transactions.
+
+ As the very first step of XA ROLLBACK, we must
+ change the undo log state back from
+ TRX_UNDO_PREPARED to TRX_UNDO_ACTIVE, in order
+ to ensure that recovery will complete the
+ rollback.
+
+ Failure to perform this step could cause a
+ situation where we would roll back part of
+ a XA PREPARE transaction, the server would be
+ killed, and finally, the transaction would be
+ recovered in XA PREPARE state, with some of
+ the actions already having been rolled back. */
+ ut_ad(trx->rsegs.m_redo.undo->rseg
+ == trx->rsegs.m_redo.rseg);
+ mtr_t mtr;
+ mtr.start();
+ if (trx_undo_t* undo = trx->rsegs.m_redo.undo) {
+ trx_undo_set_state_at_prepare(trx, undo, true,
+ &mtr);
+ }
+ /* Write the redo log for the XA ROLLBACK
+ state change to the global buffer. It is
+ not necessary to flush the redo log. If
+ a durable log write of a later mini-transaction
+ takes place for whatever reason, then this state
+ change will be durable as well. */
+ mtr.commit();
+ ut_ad(mtr.commit_lsn() > 0);
+ }
+ return(trx_rollback_for_mysql_low(trx));
+
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ ut_ad(!trx->is_autocommit_non_locking());
+ break;
+ }
+
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/*******************************************************************//**
+Rollback the latest SQL statement for MySQL.
+@return error code or DB_SUCCESS */
+dberr_t
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ dberr_t err;
+
+ /* We are reading trx->state without holding trx->mutex
+ here, because the statement rollback should be invoked for a
+ running active MySQL transaction that is associated with the
+ current thread. */
+ ut_ad(trx->mysql_thd);
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ return(DB_SUCCESS);
+
+ case TRX_STATE_ACTIVE:
+ ut_ad(trx->mysql_thd);
+ ut_ad(!trx->is_recovered);
+ ut_ad(!trx->is_autocommit_non_locking() || trx->read_only);
+
+ trx->op_info = "rollback of SQL statement";
+
+ err = trx->rollback(&trx->last_sql_stat_start);
+
+ if (trx->fts_trx != NULL) {
+ fts_savepoint_rollback_last_stmt(trx);
+ fts_savepoint_laststmt_refresh(trx);
+ }
+
+ trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+ trx->end_bulk_insert();
+
+ trx->op_info = "";
+
+ return(err);
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ /* The statement rollback is only allowed on an ACTIVE
+ transaction, not a PREPARED or COMMITTED one. */
+ break;
+ }
+
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/*******************************************************************//**
+Search for a savepoint using name.
+@return savepoint if found else NULL */
+static
+trx_named_savept_t*
+trx_savepoint_find(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ const char* name) /*!< in: savepoint name */
+{
+ trx_named_savept_t* savep;
+
+ for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+ savep != NULL;
+ savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) {
+ if (!strcmp(savep->name, name)) {
+ return(savep);
+ }
+ }
+
+ return(NULL);
+}
+
+/*******************************************************************//**
+Frees a single savepoint struct. */
+static
+void
+trx_roll_savepoint_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep) /*!< in: savepoint to free */
+{
+ UT_LIST_REMOVE(trx->trx_savepoints, savep);
+
+ ut_free(savep->name);
+ ut_free(savep);
+}
+
+/** Discard all savepoints starting from a particular savepoint.
+@param savept first savepoint to discard */
+void trx_t::savepoints_discard(trx_named_savept_t *savept)
+{
+ while (savept)
+ {
+ auto next= UT_LIST_GET_NEXT(trx_savepoints, savept);
+ trx_roll_savepoint_free(this, savept);
+ savept= next;
+ }
+}
+
+/*******************************************************************//**
+Rolls back a transaction back to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+trx_rollback_to_savepoint_for_mysql_low(
+/*====================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ trx_named_savept_t* savep, /*!< in/out: savepoint */
+ int64_t* mysql_binlog_cache_pos)
+ /*!< out: the MySQL binlog
+ cache position corresponding
+ to this savepoint; MySQL needs
+ this information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+{
+ dberr_t err;
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_ad(trx->mysql_thd);
+
+ /* Free all savepoints strictly later than savep. */
+
+ trx->savepoints_discard(UT_LIST_GET_NEXT(trx_savepoints, savep));
+
+ *mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos;
+
+ trx->op_info = "rollback to a savepoint";
+
+ err = trx->rollback(&savep->savept);
+
+ /* Store the current undo_no of the transaction so that
+ we know where to roll back if we have to roll back the
+ next SQL statement: */
+
+ trx_mark_sql_stat_end(trx);
+
+ trx->op_info = "";
+ return(err);
+}
+
+/*******************************************************************//**
+Rolls back a transaction back to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+dberr_t
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ int64_t* mysql_binlog_cache_pos) /*!< out: the MySQL binlog cache
+ position corresponding to this
+ savepoint; MySQL needs this
+ information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+{
+ trx_named_savept_t* savep;
+
+ /* We are reading trx->state without holding trx->mutex
+ here, because the savepoint rollback should be invoked for a
+ running active MySQL transaction that is associated with the
+ current thread. */
+ ut_ad(trx->mysql_thd);
+
+ savep = trx_savepoint_find(trx, savepoint_name);
+
+ if (savep == NULL) {
+ return(DB_NO_SAVEPOINT);
+ }
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ ib::error() << "Transaction has a savepoint "
+ << savep->name
+ << " though it is not started";
+ return(DB_ERROR);
+
+ case TRX_STATE_ACTIVE:
+
+ return(trx_rollback_to_savepoint_for_mysql_low(
+ trx, savep, mysql_binlog_cache_pos));
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ /* The savepoint rollback is only allowed on an ACTIVE
+ transaction, not a PREPARED or COMMITTED one. */
+ break;
+ }
+
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/*******************************************************************//**
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new. Savepoints are deleted in a transaction
+commit or rollback.
+@return always DB_SUCCESS */
+dberr_t
+trx_savepoint_for_mysql(
+/*====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ int64_t binlog_cache_pos) /*!< in: MySQL binlog cache
+ position corresponding to this
+ connection at the time of the
+ savepoint */
+{
+ trx_named_savept_t* savep;
+
+ trx_start_if_not_started_xa(trx, false);
+
+ savep = trx_savepoint_find(trx, savepoint_name);
+
+ if (savep) {
+ /* There is a savepoint with the same name: free that */
+
+ UT_LIST_REMOVE(trx->trx_savepoints, savep);
+
+ ut_free(savep->name);
+ ut_free(savep);
+ }
+
+ /* Create a new savepoint and add it as the last in the list */
+
+ savep = static_cast<trx_named_savept_t*>(
+ ut_malloc_nokey(sizeof(*savep)));
+
+ savep->name = mem_strdup(savepoint_name);
+
+ savep->savept.least_undo_no = trx->undo_no;
+ trx->last_sql_stat_start.least_undo_no = trx->undo_no;
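+	/* The savepoint only records the current undo number; a later
+	ROLLBACK TO SAVEPOINT will undo every undo log record whose
+	number is at or above least_undo_no. */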
+
+ savep->mysql_binlog_cache_pos = binlog_cache_pos;
+
+ UT_LIST_ADD_LAST(trx->trx_savepoints, savep);
+
+ trx->end_bulk_insert();
+
+ return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Releases only the named savepoint. Savepoints which were set after this
+savepoint are left as is.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+dberr_t
+trx_release_savepoint_for_mysql(
+/*============================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name) /*!< in: savepoint name */
+{
+ trx_named_savept_t* savep;
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE, true)
+ || trx_state_eq(trx, TRX_STATE_PREPARED, true));
+ ut_ad(trx->mysql_thd);
+
+ savep = trx_savepoint_find(trx, savepoint_name);
+
+ if (savep != NULL) {
+ trx_roll_savepoint_free(trx, savep);
+ }
+
+ return(savep != NULL ? DB_SUCCESS : DB_NO_SAVEPOINT);
+}
+
+/*******************************************************************//**
+Roll back an active transaction. */
+static
+void
+trx_rollback_active(
+/*================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+ roll_node_t* roll_node;
+ const trx_id_t trx_id = trx->id;
+
+ ut_ad(trx_id);
+
+ heap = mem_heap_create(512);
+
+ fork = que_fork_create(heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap, NULL);
+
+ roll_node = roll_node_create(heap);
+
+ thr->child = roll_node;
+ roll_node->common.parent = thr;
+
+ trx->graph = fork;
+
+ ut_a(thr == que_fork_start_command(fork));
+
+ trx_roll_crash_recv_trx = trx;
+
+ const bool dictionary_locked = trx->dict_operation;
+
+ if (dictionary_locked) {
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ que_run_threads(thr);
+ ut_a(roll_node->undo_thr != NULL);
+
+ que_run_threads(roll_node->undo_thr);
+
+ que_graph_free(
+ static_cast<que_t*>(roll_node->undo_thr->common.parent));
+
+ if (UNIV_UNLIKELY(!trx->rollback_finish())) {
+ ut_ad(!dictionary_locked);
+ } else {
+ ib::info() << "Rolled back recovered transaction " << trx_id;
+ }
+
+ if (dictionary_locked) {
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ mem_heap_free(heap);
+
+ trx_roll_crash_recv_trx = NULL;
+}
+
+
+struct trx_roll_count_callback_arg
+{
+ uint32_t n_trx;
+ uint64_t n_rows;
+ trx_roll_count_callback_arg(): n_trx(0), n_rows(0) {}
+};
+
+
+static my_bool trx_roll_count_callback(rw_trx_hash_element_t *element,
+ trx_roll_count_callback_arg *arg)
+{
+ element->mutex.wr_lock();
+ if (trx_t *trx= element->trx)
+ {
+ if (trx->is_recovered && trx_state_eq(trx, TRX_STATE_ACTIVE))
+ {
+ arg->n_trx++;
+ arg->n_rows+= trx->undo_no;
+ }
+ }
+ element->mutex.wr_unlock();
+ return 0;
+}
+
+/** Report progress when rolling back a row of a recovered transaction. */
+void trx_roll_report_progress()
+{
+ time_t now = time(NULL);
+ mysql_mutex_lock(&recv_sys.mutex);
+ bool report = recv_sys.report(now);
+ mysql_mutex_unlock(&recv_sys.mutex);
+
+ if (report) {
+ trx_roll_count_callback_arg arg;
+
+ /* Get number of recovered active transactions and number of
+ rows they modified. Numbers must be accurate, because only this
+ thread is allowed to touch recovered transactions. */
+ trx_sys.rw_trx_hash.iterate_no_dups(
+ trx_roll_count_callback, &arg);
+
+ if (arg.n_rows > 0) {
+ service_manager_extend_timeout(
+ INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "To roll back: " UINT32PF " transactions, "
+ UINT64PF " rows", arg.n_trx, arg.n_rows);
+ }
+
+ ib::info() << "To roll back: " << arg.n_trx
+ << " transactions, " << arg.n_rows << " rows";
+
+ }
+}
+
+
+static my_bool trx_rollback_recovered_callback(rw_trx_hash_element_t *element,
+ std::vector<trx_t*> *trx_list)
+{
+ element->mutex.wr_lock();
+ if (trx_t *trx= element->trx)
+ {
+ trx->mutex_lock();
+ if (trx_state_eq(trx, TRX_STATE_ACTIVE) && trx->is_recovered)
+ trx_list->push_back(trx);
+ trx->mutex_unlock();
+ }
+ element->mutex.wr_unlock();
+ return 0;
+}
+
+/**
+ Rollback any incomplete transactions which were encountered in crash recovery.
+
+ If the transaction already was committed, then we clean up a possible insert
+ undo log. If the transaction was not yet committed, then we roll it back.
+
+ Note: For XA recovered transactions, we rely on MySQL to
+ do rollback. They will be in TRX_STATE_PREPARED state. If the server
+  is shut down and they are still lingering in trx_sys_t::trx_list
+ then the shutdown will hang.
+
+ @param[in] all true=roll back all recovered active transactions;
+ false=roll back any incomplete dictionary transaction
+*/
+
+void trx_rollback_recovered(bool all)
+{
+ std::vector<trx_t*> trx_list;
+
+ ut_a(srv_force_recovery <
+ ulong(all ? SRV_FORCE_NO_TRX_UNDO : SRV_FORCE_NO_DDL_UNDO));
+
+ /*
+ Collect list of recovered ACTIVE transaction ids first. Once collected, no
+ other thread is allowed to modify or remove these transactions from
+ rw_trx_hash.
+ */
+ trx_sys.rw_trx_hash.iterate_no_dups(trx_rollback_recovered_callback,
+ &trx_list);
+
+ while (!trx_list.empty())
+ {
+ trx_t *trx= trx_list.back();
+ trx_list.pop_back();
+
+ ut_ad(trx);
+ ut_d(trx->mutex_lock());
+ ut_ad(trx->is_recovered);
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_d(trx->mutex_unlock());
+
+ if (srv_shutdown_state != SRV_SHUTDOWN_NONE && !srv_undo_sources &&
+ srv_fast_shutdown)
+ goto discard;
+
+ if (all || trx->dict_operation || trx->has_stats_table_lock())
+ {
+ trx_rollback_active(trx);
+ if (trx->error_state != DB_SUCCESS)
+ {
+ ut_ad(trx->error_state == DB_INTERRUPTED);
+ trx->error_state= DB_SUCCESS;
+ ut_ad(!srv_undo_sources);
+ ut_ad(srv_fast_shutdown);
+discard:
+ /* Note: before kill_server() invoked innobase_end() via
+ unireg_end(), it invoked close_connections(), which should initiate
+ the rollback of any user transactions via THD::cleanup() in the
+ connection threads, and wait for all THD::cleanup() to complete.
+ So, no active user transactions should exist at this point.
+
+ srv_undo_sources=false was cleared early in innobase_end().
+
+ Generally, the server guarantees that all connections using
+ InnoDB must be disconnected by the time we are reaching this code,
+ be it during shutdown or UNINSTALL PLUGIN.
+
+ Because there is no possible race condition with any
+ concurrent user transaction, we do not have to invoke
+ trx->commit_state() or wait for !trx->is_referenced()
+ before trx_sys.deregister_rw(trx). */
+ trx_sys.deregister_rw(trx);
+ trx_free_at_shutdown(trx);
+ }
+ else
+ trx->free();
+ }
+ }
+}
+
+/*******************************************************************//**
+Roll back or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction was already
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+Note: this is done in a background thread. */
+void trx_rollback_all_recovered(void*)
+{
+ ut_ad(!srv_read_only_mode);
+
+ if (trx_sys.rw_trx_hash.size()) {
+ ib::info() << "Starting in background the rollback of"
+ " recovered transactions";
+ trx_rollback_recovered(true);
+ ib::info() << "Rollback of non-prepared transactions"
+ " completed";
+ }
+
+ trx_rollback_is_active = false;
+}
+
+/****************************************************************//**
+Builds an undo 'query' graph for a transaction. The actual rollback is
+performed by executing this query graph like a query subprocedure call.
+The reply about the completion of the rollback will be sent by this
+graph.
+@return own: the query graph */
+static
+que_t*
+trx_roll_graph_build(
+/*=================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+
+ ut_ad(trx->mutex_is_owner());
+ heap = mem_heap_create(512);
+ fork = que_fork_create(heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap, NULL);
+
+ thr->child = row_undo_node_create(trx, thr, heap);
+
+ return(fork);
+}
+
+/*********************************************************************//**
+Starts a rollback operation, creates the UNDO graph that will do the
+actual undo operation.
+@return query graph thread that will perform the UNDO operations. */
+static
+que_thr_t*
+trx_rollback_start(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ undo_no_t roll_limit) /*!< in: rollback to undo no (for
+ partial undo), 0 if we are rolling back
+ the entire transaction */
+{
+	/* Initialize the rollback fields in the transaction */
+
+ ut_ad(trx->mutex_is_owner());
+ ut_ad(!trx->roll_limit);
+ ut_ad(!trx->in_rollback);
+
+ trx->roll_limit = roll_limit;
+ trx->in_rollback = true;
+
+ ut_a(trx->roll_limit <= trx->undo_no);
+
+ trx->pages_undone = 0;
+
+ /* Build a 'query' graph which will perform the undo operations */
+
+ que_t* roll_graph = trx_roll_graph_build(trx);
+
+ trx->graph = roll_graph;
+
+ return(que_fork_start_command(roll_graph));
+}
+
+/*********************************************************************//**
+Creates a rollback command node struct.
+@return own: rollback node struct */
+roll_node_t*
+roll_node_create(
+/*=============*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ roll_node_t* node;
+
+ node = static_cast<roll_node_t*>(mem_heap_zalloc(heap, sizeof(*node)));
+
+ node->state = ROLL_NODE_SEND;
+
+ node->common.type = QUE_NODE_ROLLBACK;
+
+ return(node);
+}
+
+/***********************************************************//**
+Performs an execution step for a rollback command node in a query graph.
+@return query thread to run next, or NULL */
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ roll_node_t* node;
+
+ node = static_cast<roll_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = ROLL_NODE_SEND;
+ }
+
+ if (node->state == ROLL_NODE_SEND) {
+ trx_t* trx;
+ ib_id_t roll_limit;
+
+ trx = thr_get_trx(thr);
+
+ node->state = ROLL_NODE_WAIT;
+
+ ut_a(node->undo_thr == NULL);
+
+ roll_limit = node->savept ? node->savept->least_undo_no : 0;
+
+ trx->mutex_lock();
+
+ trx_commit_or_rollback_prepare(trx);
+
+ node->undo_thr = trx_rollback_start(trx, roll_limit);
+
+ trx->mutex_unlock();
+ } else {
+ ut_ad(node->state == ROLL_NODE_WAIT);
+
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc
new file mode 100644
index 00000000..760c4e70
--- /dev/null
+++ b/storage/innobase/trx/trx0rseg.cc
@@ -0,0 +1,713 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rseg.cc
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "fut0lst.h"
+#include "srv0srv.h"
+#include "trx0purge.h"
+#include "srv0mon.h"
+
+#ifdef WITH_WSREP
+# include <mysql/service_wsrep.h>
+
+/** The offset to WSREP XID headers, after TRX_RSEG */
+# define TRX_RSEG_WSREP_XID_INFO TRX_RSEG_MAX_TRX_ID + 16 + 512
+
+/** WSREP XID format (1 if present and valid, 0 if not present) */
+# define TRX_RSEG_WSREP_XID_FORMAT TRX_RSEG_WSREP_XID_INFO
+/** WSREP XID GTRID length */
+# define TRX_RSEG_WSREP_XID_GTRID_LEN TRX_RSEG_WSREP_XID_INFO + 4
+/** WSREP XID bqual length */
+# define TRX_RSEG_WSREP_XID_BQUAL_LEN TRX_RSEG_WSREP_XID_INFO + 8
+/** WSREP XID data (XIDDATASIZE bytes) */
+# define TRX_RSEG_WSREP_XID_DATA TRX_RSEG_WSREP_XID_INFO + 12
+
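+/* Relative layout of the WSREP XID checkpoint within the rollback segment
+header, as defined by the offsets above:
+  +0   4 bytes            XID formatID (0 if no XID is present)
+  +4   4 bytes            gtrid length
+  +8   4 bytes            bqual length
+  +12  XIDDATASIZE bytes  XID data (the unused tail is zeroed) */
+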
+# ifdef UNIV_DEBUG
+/** The latest known WSREP XID sequence number */
+static long long wsrep_seqno = -1;
+# endif /* UNIV_DEBUG */
+/** The latest known WSREP XID UUID */
+static unsigned char wsrep_uuid[16];
+
+/** Write the WSREP XID information into rollback segment header.
+@param[in,out] rseg_header rollback segment header
+@param[in] xid WSREP XID
+@param[in,out] mtr mini transaction */
+static void
+trx_rseg_write_wsrep_checkpoint(
+ buf_block_t* rseg_header,
+ const XID* xid,
+ mtr_t* mtr)
+{
+ DBUG_ASSERT(xid->gtrid_length >= 0);
+ DBUG_ASSERT(xid->bqual_length >= 0);
+ DBUG_ASSERT(xid->gtrid_length + xid->bqual_length < XIDDATASIZE);
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_FORMAT
+ + rseg_header->page.frame,
+ uint32_t(xid->formatID));
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_GTRID_LEN
+ + rseg_header->page.frame,
+ uint32_t(xid->gtrid_length));
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_BQUAL_LEN
+ + rseg_header->page.frame,
+ uint32_t(xid->bqual_length));
+
+ const ulint xid_length = static_cast<ulint>(xid->gtrid_length
+ + xid->bqual_length);
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+ + rseg_header->page.frame,
+ xid->data, xid_length);
+ if (xid_length < XIDDATASIZE
+ && memcmp(TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+ + rseg_header->page.frame, field_ref_zero,
+ XIDDATASIZE - xid_length)) {
+ mtr->memset(rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_DATA + xid_length,
+ XIDDATASIZE - xid_length, 0);
+ }
+}
+
+/** Update the WSREP XID information in rollback segment header.
+@param[in,out] rseg_header rollback segment header
+@param[in] xid WSREP XID
+@param[in,out] mtr mini-transaction */
+void
+trx_rseg_update_wsrep_checkpoint(
+ buf_block_t* rseg_header,
+ const XID* xid,
+ mtr_t* mtr)
+{
+ ut_ad(wsrep_is_wsrep_xid(xid));
+
+#ifdef UNIV_DEBUG
+ /* Check that seqno is monotonically increasing */
+ long long xid_seqno = wsrep_xid_seqno(xid);
+ const byte* xid_uuid = wsrep_xid_uuid(xid);
+
+ if (xid_seqno != -1
+ && !memcmp(xid_uuid, wsrep_uuid, sizeof wsrep_uuid)) {
+ ut_ad(xid_seqno > wsrep_seqno);
+ } else {
+ memcpy(wsrep_uuid, xid_uuid, sizeof wsrep_uuid);
+ }
+ wsrep_seqno = xid_seqno;
+#endif /* UNIV_DEBUG */
+ trx_rseg_write_wsrep_checkpoint(rseg_header, xid, mtr);
+}
+
+static dberr_t trx_rseg_update_wsrep_checkpoint(const XID* xid, mtr_t* mtr)
+{
+ dberr_t err;
+ buf_block_t *rseg_header = trx_sys.rseg_array[0].get(mtr, &err);
+
+ if (UNIV_UNLIKELY(!rseg_header))
+ return err;
+
+  /* We must check against wsrep_uuid here, before calling
+  trx_rseg_update_wsrep_checkpoint(): in debug mode that function
+  overwrites wsrep_uuid with the xid contents, after which the memcmp()
+  would never return a nonzero result. */
+ const bool must_clear_rsegs=
+ memcmp(wsrep_uuid, wsrep_xid_uuid(xid), sizeof wsrep_uuid);
+
+ if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT +
+ rseg_header->page.frame)))
+ trx_rseg_format_upgrade(rseg_header, mtr);
+
+ trx_rseg_update_wsrep_checkpoint(rseg_header, xid, mtr);
+
+ if (must_clear_rsegs)
+ /* Because the UUID part of the WSREP XID differed from
+ current_xid_uuid, the WSREP group UUID was changed, and we must
+ reset the XID in all rollback segment headers. */
+ for (ulint rseg_id= 1; rseg_id < TRX_SYS_N_RSEGS; ++rseg_id)
+ if (buf_block_t* block= trx_sys.rseg_array[rseg_id].get(mtr, &err))
+ mtr->memset(block, TRX_RSEG + TRX_RSEG_WSREP_XID_INFO,
+ TRX_RSEG_WSREP_XID_DATA + XIDDATASIZE -
+ TRX_RSEG_WSREP_XID_INFO, 0);
+ return err;
+}
+
+/** Update WSREP checkpoint XID in first rollback segment header
+as part of wsrep_set_SE_checkpoint() when it is guaranteed that there
+are no wsrep transactions committing.
+If the UUID part of the WSREP XID does not match to the UUIDs of XIDs already
+stored into rollback segments, the WSREP XID in all the remaining rollback
+segments will be reset.
+@param[in] xid WSREP XID */
+void trx_rseg_update_wsrep_checkpoint(const XID* xid)
+{
+ mtr_t mtr;
+ mtr.start();
+ trx_rseg_update_wsrep_checkpoint(xid, &mtr);
+ mtr.commit();
+}
+
+/** Read the WSREP XID information in rollback segment header.
+@param[in] rseg_header Rollback segment header
+@param[out] xid Transaction XID
+@return whether the WSREP XID was present */
+static
+bool trx_rseg_read_wsrep_checkpoint(const buf_block_t *rseg_header, XID &xid)
+{
+ int formatID = static_cast<int>(
+ mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_FORMAT
+ + rseg_header->page.frame));
+ if (formatID == 0) {
+ return false;
+ }
+
+ xid.formatID = formatID;
+ xid.gtrid_length = static_cast<int>(
+ mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_GTRID_LEN
+ + rseg_header->page.frame));
+
+ xid.bqual_length = static_cast<int>(
+ mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_BQUAL_LEN
+ + rseg_header->page.frame));
+
+ memcpy(xid.data, TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+ + rseg_header->page.frame, XIDDATASIZE);
+
+ return true;
+}
+
+/** Read the WSREP XID from the TRX_SYS page (in case of upgrade).
+@param[in] page TRX_SYS page
+@param[out] xid WSREP XID (if present)
+@return whether the WSREP XID is present */
+static bool trx_rseg_init_wsrep_xid(const page_t* page, XID& xid)
+{
+ if (mach_read_from_4(TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_MAGIC_N_FLD
+ + page)
+ != TRX_SYS_WSREP_XID_MAGIC_N) {
+ return false;
+ }
+
+ xid.formatID = static_cast<int>(
+ mach_read_from_4(
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_FORMAT + page));
+ xid.gtrid_length = static_cast<int>(
+ mach_read_from_4(
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_GTRID_LEN + page));
+ xid.bqual_length = static_cast<int>(
+ mach_read_from_4(
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_BQUAL_LEN + page));
+ memcpy(xid.data,
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_DATA + page, XIDDATASIZE);
+ return true;
+}
+
+/** Recover the latest WSREP checkpoint XID.
+@param[out] xid WSREP XID
+@return whether the WSREP XID was found */
+bool trx_rseg_read_wsrep_checkpoint(XID& xid)
+{
+ mtr_t mtr;
+ long long max_xid_seqno = -1;
+ bool found = false;
+
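+	/* Scan every rollback segment header and keep the most recent
+	WSREP XID, that is, the one with the highest seqno. */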
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS;
+ rseg_id++, mtr.commit()) {
+ mtr.start();
+ const buf_block_t* sys = trx_sysf_get(&mtr, false);
+ if (UNIV_UNLIKELY(!sys)) {
+ continue;
+ }
+ const uint32_t page_no = trx_sysf_rseg_get_page_no(
+ sys, rseg_id);
+
+ if (page_no == FIL_NULL) {
+ continue;
+ }
+
+ const buf_block_t* rseg_header = buf_page_get_gen(
+ page_id_t(trx_sysf_rseg_get_space(sys, rseg_id),
+ page_no),
+ 0, RW_S_LATCH, nullptr, BUF_GET, &mtr);
+
+ if (!rseg_header) {
+ continue;
+ }
+
+ if (mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+ + rseg_header->page.frame)) {
+ continue;
+ }
+
+ XID tmp_xid;
+ long long tmp_seqno = 0;
+ if (trx_rseg_read_wsrep_checkpoint(rseg_header, tmp_xid)
+ && (tmp_seqno = wsrep_xid_seqno(&tmp_xid))
+ > max_xid_seqno) {
+ found = true;
+ max_xid_seqno = tmp_seqno;
+ xid = tmp_xid;
+ memcpy(wsrep_uuid, wsrep_xid_uuid(&tmp_xid),
+ sizeof wsrep_uuid);
+ }
+ }
+
+ return found;
+}
+#endif /* WITH_WSREP */
+
+buf_block_t *trx_rseg_t::get(mtr_t *mtr, dberr_t *err) const
+{
+ if (!space)
+ {
+ if (err) *err= DB_TABLESPACE_NOT_FOUND;
+ return nullptr;
+ }
+ return buf_page_get_gen(page_id(), 0, RW_X_LATCH, nullptr,
+ BUF_GET, mtr, err);
+}
+
+/** Upgrade a rollback segment header page to MariaDB 10.3 format.
+@param[in,out] rseg_header rollback segment header page
+@param[in,out] mtr mini-transaction */
+void trx_rseg_format_upgrade(buf_block_t *rseg_header, mtr_t *mtr)
+{
+ mtr->memset(rseg_header, TRX_RSEG + TRX_RSEG_FORMAT, 4, 0);
+  /* Also clear any possible garbage at the end of the page. Old
+ InnoDB versions did not initialize unused parts of pages. */
+ mtr->memset(rseg_header, TRX_RSEG + TRX_RSEG_MAX_TRX_ID + 8,
+ srv_page_size
+ - (FIL_PAGE_DATA_END + TRX_RSEG + TRX_RSEG_MAX_TRX_ID + 8),
+ 0);
+}
+
+/** Create a rollback segment header.
+@param[in,out] space system, undo, or temporary tablespace
+@param[in] rseg_id rollback segment identifier
+@param[in] max_trx_id new value of TRX_RSEG_MAX_TRX_ID
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@return the created rollback segment
+@retval nullptr on failure */
+buf_block_t *trx_rseg_header_create(fil_space_t *space, ulint rseg_id,
+ trx_id_t max_trx_id, mtr_t *mtr,
+ dberr_t *err)
+{
+ ut_ad(mtr->memo_contains(*space));
+ buf_block_t *block=
+ fseg_create(space, TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr, err);
+ if (block)
+ {
+ ut_ad(0 == mach_read_from_4(TRX_RSEG_FORMAT + TRX_RSEG +
+ block->page.frame));
+ ut_ad(0 == mach_read_from_4(TRX_RSEG_HISTORY_SIZE + TRX_RSEG +
+ block->page.frame));
+ ut_ad(0 == mach_read_from_4(TRX_RSEG_MAX_TRX_ID + TRX_RSEG +
+ block->page.frame));
+
+ /* Initialize the history list */
+ flst_init(block, TRX_RSEG_HISTORY + TRX_RSEG, mtr);
+
+ mtr->write<8,mtr_t::MAYBE_NOP>(*block, TRX_RSEG + TRX_RSEG_MAX_TRX_ID +
+ block->page.frame, max_trx_id);
+
+ /* Reset the undo log slots */
+ mtr->memset(block, TRX_RSEG_UNDO_SLOTS + TRX_RSEG, TRX_RSEG_N_SLOTS * 4,
+ 0xff);
+ }
+ return block;
+}
+
+void trx_rseg_t::destroy()
+{
+ latch.destroy();
+
+ /* There can't be any active transactions. */
+ ut_a(!UT_LIST_GET_LEN(undo_list));
+
+ for (trx_undo_t *next, *undo= UT_LIST_GET_FIRST(undo_cached); undo;
+ undo= next)
+ {
+ next= UT_LIST_GET_NEXT(undo_list, undo);
+ UT_LIST_REMOVE(undo_cached, undo);
+ ut_free(undo);
+ }
+}
+
+void trx_rseg_t::init(fil_space_t *space, uint32_t page)
+{
+ latch.SRW_LOCK_INIT(trx_rseg_latch_key);
+ ut_ad(!this->space);
+ this->space= space;
+ page_no= page;
+ last_page_no= FIL_NULL;
+ curr_size= 1;
+
+ UT_LIST_INIT(undo_list, &trx_undo_t::undo_list);
+ UT_LIST_INIT(undo_cached, &trx_undo_t::undo_list);
+}
+
+void trx_rseg_t::reinit(uint32_t page)
+{
+ ut_ad(is_persistent());
+ ut_ad(page_no == page);
+ ut_a(!UT_LIST_GET_LEN(undo_list));
+ ut_ad(!history_size || UT_LIST_GET_FIRST(undo_cached));
+
+ history_size= 0;
+ page_no= page;
+
+ for (trx_undo_t *next, *undo= UT_LIST_GET_FIRST(undo_cached); undo;
+ undo= next)
+ {
+ next= UT_LIST_GET_NEXT(undo_list, undo);
+ UT_LIST_REMOVE(undo_cached, undo);
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ ut_free(undo);
+ }
+
+ ut_ad(!is_referenced());
+ clear_needs_purge();
+ last_commit_and_offset= 0;
+ last_page_no= FIL_NULL;
+ curr_size= 1;
+}
+
+/** Read the undo log lists.
+@param[in,out] rseg rollback segment
+@param[in,out] max_trx_id maximum observed transaction identifier
+@param[in] rseg_header rollback segment header
+@return error code */
+static dberr_t trx_undo_lists_init(trx_rseg_t *rseg, trx_id_t &max_trx_id,
+ const buf_block_t *rseg_header)
+{
+ ut_ad(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN);
+
+ for (ulint i= 0; i < TRX_RSEG_N_SLOTS; i++)
+ {
+ uint32_t page_no= trx_rsegf_get_nth_undo(rseg_header, i);
+ if (page_no != FIL_NULL)
+ {
+ const trx_undo_t *undo= trx_undo_mem_create_at_db_start(rseg, i, page_no,
+ max_trx_id);
+ if (!undo)
+ return DB_CORRUPTION;
+ rseg->curr_size+= undo->size;
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED);
+ }
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Restore the state of a persistent rollback segment.
+@param[in,out] rseg persistent rollback segment
+@param[in,out] max_trx_id maximum observed transaction identifier
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, trx_id_t &max_trx_id,
+ mtr_t *mtr)
+{
+ if (!rseg->space)
+ return DB_TABLESPACE_NOT_FOUND;
+ dberr_t err;
+ const buf_block_t *rseg_hdr=
+ buf_page_get_gen(rseg->page_id(), 0, RW_S_LATCH, nullptr, BUF_GET, mtr,
+ &err);
+ if (!rseg_hdr)
+ return err;
+
+ if (!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + rseg_hdr->page.frame))
+ {
+ trx_id_t id= mach_read_from_8(TRX_RSEG + TRX_RSEG_MAX_TRX_ID +
+ rseg_hdr->page.frame);
+
+ if (id > max_trx_id)
+ max_trx_id= id;
+
+ const byte *binlog_name=
+ TRX_RSEG + TRX_RSEG_BINLOG_NAME + rseg_hdr->page.frame;
+ if (*binlog_name)
+ {
+ lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
+ (FIL_PAGE_LSN + rseg_hdr->page.frame));
+ static_assert(TRX_RSEG_BINLOG_NAME_LEN ==
+ sizeof trx_sys.recovered_binlog_filename, "compatibility");
+ if (lsn > trx_sys.recovered_binlog_lsn)
+ {
+ trx_sys.recovered_binlog_lsn= lsn;
+ trx_sys.recovered_binlog_offset=
+ mach_read_from_8(TRX_RSEG + TRX_RSEG_BINLOG_OFFSET +
+ rseg_hdr->page.frame);
+ memcpy(trx_sys.recovered_binlog_filename, binlog_name,
+ TRX_RSEG_BINLOG_NAME_LEN);
+ }
+
+#ifdef WITH_WSREP
+ trx_rseg_read_wsrep_checkpoint(rseg_hdr, trx_sys.recovered_wsrep_xid);
+#endif
+ }
+ }
+
+ if (srv_operation == SRV_OPERATION_RESTORE)
+ /* mariabackup --prepare only deals with
+ the redo log and the data files, not with
+ transactions or the data dictionary. */
+ return DB_SUCCESS;
+
+ /* Initialize the undo log lists according to the rseg header */
+
+ rseg->curr_size = mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE +
+ rseg_hdr->page.frame) + 1;
+ err= trx_undo_lists_init(rseg, max_trx_id, rseg_hdr);
+ if (err != DB_SUCCESS);
+ else if (auto len= flst_get_len(TRX_RSEG + TRX_RSEG_HISTORY +
+ rseg_hdr->page.frame))
+ {
+ rseg->history_size+= len;
+
+ fil_addr_t node_addr= flst_get_last(TRX_RSEG + TRX_RSEG_HISTORY +
+ rseg_hdr->page.frame);
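+    /* node_addr points to the TRX_UNDO_HISTORY_NODE field inside the
+    undo log header; convert it to the offset of the header itself. */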
+ node_addr.boffset= static_cast<uint16_t>(node_addr.boffset -
+ TRX_UNDO_HISTORY_NODE);
+ rseg->last_page_no= node_addr.page;
+
+ const buf_block_t* block=
+ buf_page_get_gen(page_id_t(rseg->space->id, node_addr.page),
+ 0, RW_S_LATCH, nullptr, BUF_GET, mtr, &err);
+ if (!block)
+ return err;
+
+ trx_id_t id= mach_read_from_8(block->page.frame + node_addr.boffset +
+ TRX_UNDO_TRX_ID);
+ if (id > max_trx_id)
+ max_trx_id= id;
+ id= mach_read_from_8(block->page.frame + node_addr.boffset +
+ TRX_UNDO_TRX_NO);
+ if (id > max_trx_id)
+ max_trx_id= id;
+
+ rseg->set_last_commit(node_addr.boffset, id);
+ unsigned purge= mach_read_from_2(block->page.frame + node_addr.boffset +
+ TRX_UNDO_NEEDS_PURGE);
+ ut_ad(purge <= 1);
+ if (purge != 0)
+ rseg->set_needs_purge();
+
+ if (rseg->last_page_no != FIL_NULL)
+    /* There is no need to protect this operation with the purge
+    mutex, because we are still bootstrapping. */
+ purge_sys.purge_queue.push(*rseg);
+ }
+
+ return err;
+}
+
+/** Read binlog metadata from the TRX_SYS page, in case we are upgrading
+from MySQL or a MariaDB version older than 10.3.5. */
+static void trx_rseg_init_binlog_info(const page_t* page)
+{
+ if (mach_read_from_4(TRX_SYS + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
+ + page)
+ == TRX_SYS_MYSQL_LOG_MAGIC_N) {
+ memcpy(trx_sys.recovered_binlog_filename,
+ TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_NAME
+ + TRX_SYS + page, TRX_SYS_MYSQL_LOG_NAME_LEN);
+ trx_sys.recovered_binlog_offset = mach_read_from_8(
+ TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_OFFSET
+ + TRX_SYS + page);
+ }
+
+#ifdef WITH_WSREP
+ trx_rseg_init_wsrep_xid(page, trx_sys.recovered_wsrep_xid);
+#endif
+}
+
+/** Initialize or recover the rollback segments at startup. */
+dberr_t trx_rseg_array_init()
+{
+ trx_id_t max_trx_id = 0;
+
+ *trx_sys.recovered_binlog_filename = '\0';
+ trx_sys.recovered_binlog_offset = 0;
+#ifdef WITH_WSREP
+ trx_sys.recovered_wsrep_xid.null();
+ XID wsrep_sys_xid;
+ wsrep_sys_xid.null();
+ bool wsrep_xid_in_rseg_found = false;
+#endif
+ mtr_t mtr;
+ dberr_t err = DB_SUCCESS;
+
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
+ mtr.start();
+ if (const buf_block_t* sys = trx_sysf_get(&mtr, false)) {
+ if (rseg_id == 0) {
+ /* In case this is an upgrade from
+ before MariaDB 10.3.5, fetch the base
+ information from the TRX_SYS page. */
+ max_trx_id = mach_read_from_8(
+ TRX_SYS + TRX_SYS_TRX_ID_STORE
+ + sys->page.frame);
+ trx_rseg_init_binlog_info(sys->page.frame);
+#ifdef WITH_WSREP
+ wsrep_sys_xid.set(&trx_sys.recovered_wsrep_xid);
+#endif
+ }
+
+ const uint32_t page_no = trx_sysf_rseg_get_page_no(
+ sys, rseg_id);
+ if (page_no != FIL_NULL) {
+ trx_rseg_t& rseg = trx_sys.rseg_array[rseg_id];
+ rseg.init(fil_space_get(
+ trx_sysf_rseg_get_space(
+ sys, rseg_id)),
+ page_no);
+ ut_ad(rseg.is_persistent());
+ if ((err = trx_rseg_mem_restore(
+ &rseg, max_trx_id, &mtr))
+ != DB_SUCCESS) {
+ mtr.commit();
+ break;
+ }
+#ifdef WITH_WSREP
+ if (!wsrep_sys_xid.is_null() &&
+ !wsrep_sys_xid.eq(&trx_sys.recovered_wsrep_xid)) {
+ wsrep_xid_in_rseg_found = true;
+ ut_ad(memcmp(wsrep_xid_uuid(&wsrep_sys_xid),
+ wsrep_xid_uuid(&trx_sys.recovered_wsrep_xid),
+ sizeof wsrep_uuid)
+ || wsrep_xid_seqno(
+ &wsrep_sys_xid)
+ <= wsrep_xid_seqno(
+ &trx_sys.recovered_wsrep_xid));
+ }
+#endif
+ }
+ }
+
+ mtr.commit();
+ }
+
+ if (err != DB_SUCCESS) {
+ for (auto& rseg : trx_sys.rseg_array) {
+ while (auto u = UT_LIST_GET_FIRST(rseg.undo_list)) {
+ UT_LIST_REMOVE(rseg.undo_list, u);
+ ut_free(u);
+ }
+ }
+ return err;
+ }
+
+#ifdef WITH_WSREP
+ if (!wsrep_sys_xid.is_null()) {
+ /* Upgrade from a version prior to 10.3.5,
+ where WSREP XID was stored in TRX_SYS page.
+ If no rollback segment has a WSREP XID set,
+ we must copy the XID found in TRX_SYS page
+ to rollback segments. */
+ mtr.start();
+
+ if (!wsrep_xid_in_rseg_found) {
+ trx_rseg_update_wsrep_checkpoint(&wsrep_sys_xid, &mtr);
+ }
+
+ /* Finally, clear WSREP XID in TRX_SYS page. */
+ mtr.memset(trx_sysf_get(&mtr),
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO,
+ TRX_SYS_WSREP_XID_LEN, 0);
+ mtr.commit();
+ }
+#endif
+
+ trx_sys.init_max_trx_id(max_trx_id + 1);
+ return DB_SUCCESS;
+}
+
+/** Create the temporary rollback segments. */
+dberr_t trx_temp_rseg_create(mtr_t *mtr)
+{
+ for (ulong i= 0; i < array_elements(trx_sys.temp_rsegs); i++)
+ {
+ mtr->start();
+ mtr->set_log_mode(MTR_LOG_NO_REDO);
+ mtr->x_lock_space(fil_system.temp_space);
+ dberr_t err;
+ buf_block_t *rblock=
+ trx_rseg_header_create(fil_system.temp_space, i, 0, mtr, &err);
+ if (UNIV_UNLIKELY(!rblock))
+ {
+ mtr->commit();
+ return err;
+ }
+ trx_sys.temp_rsegs[i].init(fil_system.temp_space,
+ rblock->page.id().page_no());
+ mtr->commit();
+ }
+ return DB_SUCCESS;
+}
+
+/** Update the offset information about the end of the binlog entry
+which corresponds to the transaction just being committed.
+In a replication slave, this updates the master binlog position
+up to which replication has proceeded.
+@param[in,out] rseg_header rollback segment header
+@param[in] trx committing transaction
+@param[in,out] mtr mini-transaction */
+void trx_rseg_update_binlog_offset(buf_block_t *rseg_header, const trx_t *trx,
+ mtr_t *mtr)
+{
+ DBUG_LOG("trx", "trx_mysql_binlog_offset: " << trx->mysql_log_offset);
+
+ const size_t len = strlen(trx->mysql_log_file_name) + 1;
+
+ ut_ad(len > 1);
+
+ if (UNIV_UNLIKELY(len > TRX_RSEG_BINLOG_NAME_LEN)) {
+ return;
+ }
+
+ mtr->write<8,mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_BINLOG_OFFSET
+ + rseg_header->page.frame,
+ trx->mysql_log_offset);
+
+ void* name = TRX_RSEG + TRX_RSEG_BINLOG_NAME + rseg_header->page.frame;
+
+ if (memcmp(trx->mysql_log_file_name, name, len)) {
+ mtr->memcpy(*rseg_header, name, trx->mysql_log_file_name, len);
+ }
+}
diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc
new file mode 100644
index 00000000..d344f3a0
--- /dev/null
+++ b/storage/innobase/trx/trx0sys.cc
@@ -0,0 +1,357 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0sys.cc
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0sys.h"
+#include "mysqld.h"
+#include "sql_error.h"
+
+#include "fsp0fsp.h"
+#include "mtr0log.h"
+#include "mtr0log.h"
+#include "trx0trx.h"
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "os0file.h"
+
+/** The transaction system */
+trx_sys_t trx_sys;
+
+#ifdef UNIV_DEBUG
+/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
+uint trx_rseg_n_slots_debug = 0;
+#endif
+
+/** Display the MySQL binlog offset info if it is present in the trx
+system header. */
+void
+trx_sys_print_mysql_binlog_offset()
+{
+ if (!*trx_sys.recovered_binlog_filename) {
+ return;
+ }
+
+ ib::info() << "Last binlog file '"
+ << trx_sys.recovered_binlog_filename
+ << "', position "
+ << trx_sys.recovered_binlog_offset;
+}
+
+/** Find an available rollback segment.
+@param[in]	sys_header	trx system header page
+@return an unallocated rollback segment slot in the TRX_SYS header
+@retval ULINT_UNDEFINED if not found */
+ulint
+trx_sys_rseg_find_free(const buf_block_t* sys_header)
+{
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
+ if (trx_sysf_rseg_get_page_no(sys_header, rseg_id)
+ == FIL_NULL) {
+ return rseg_id;
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/** Count the number of initialized persistent rollback segment slots. */
+static
+void
+trx_sysf_get_n_rseg_slots()
+{
+ mtr_t mtr;
+ mtr.start();
+
+ srv_available_undo_logs = 0;
+ if (const buf_block_t* sys_header = trx_sysf_get(&mtr, false)) {
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
+ srv_available_undo_logs
+ += trx_sysf_rseg_get_page_no(sys_header,
+ rseg_id)
+ != FIL_NULL;
+ }
+ }
+
+ mtr.commit();
+}
+
+/** Initialize the transaction system when creating the database. */
+dberr_t trx_sys_create_sys_pages(mtr_t *mtr)
+{
+ mtr->start();
+ mtr->x_lock_space(fil_system.sys_space);
+ static_assert(TRX_SYS_SPACE == 0, "compatibility");
+
+ /* Create the trx sys file block in a new allocated file segment */
+ dberr_t err;
+ buf_block_t *block= fseg_create(fil_system.sys_space,
+ TRX_SYS + TRX_SYS_FSEG_HEADER, mtr, &err);
+ if (UNIV_UNLIKELY(!block))
+ {
+ error:
+ mtr->commit();
+ return err;
+ }
+ ut_a(block->page.id() == page_id_t(0, TRX_SYS_PAGE_NO));
+
+ mtr->write<2>(*block, FIL_PAGE_TYPE + block->page.frame,
+ FIL_PAGE_TYPE_TRX_SYS);
+
+ /* Reset the rollback segment slots. Old versions of InnoDB
+ (before MySQL 5.5) define TRX_SYS_N_RSEGS as 256 and expect
+ that the whole array is initialized. */
+ static_assert(256 >= TRX_SYS_N_RSEGS, "");
+ static_assert(TRX_SYS + TRX_SYS_RSEGS + 256 * TRX_SYS_RSEG_SLOT_SIZE <=
+ UNIV_PAGE_SIZE_MIN - FIL_PAGE_DATA_END, "");
+ mtr->write<4>(*block, TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO +
+ block->page.frame, FSP_FIRST_RSEG_PAGE_NO);
+ mtr->memset(block, TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SLOT_SIZE,
+ 255 * TRX_SYS_RSEG_SLOT_SIZE, 0xff);
+
+ buf_block_t *r= trx_rseg_header_create(fil_system.sys_space, 0, 0,
+ mtr, &err);
+ if (UNIV_UNLIKELY(!r))
+ goto error;
+ ut_a(r->page.id() == page_id_t(0, FSP_FIRST_RSEG_PAGE_NO));
+ mtr->commit();
+
+ return trx_lists_init_at_db_start();
+}
+
+void trx_sys_t::create()
+{
+ ut_ad(this == &trx_sys);
+ ut_ad(!is_initialised());
+ m_initialised= true;
+ trx_list.create();
+ rw_trx_hash.init();
+}
+
+size_t trx_sys_t::history_size()
+{
+ ut_ad(is_initialised());
+ size_t size= 0;
+ for (auto &rseg : rseg_array)
+ {
+ rseg.latch.rd_lock(SRW_LOCK_CALL);
+ size+= rseg.history_size;
+ }
+ for (auto &rseg : rseg_array)
+ rseg.latch.rd_unlock();
+ return size;
+}
+
+bool trx_sys_t::history_exceeds(size_t threshold)
+{
+ ut_ad(is_initialised());
+ size_t size= 0;
+ bool exceeds= false;
+ size_t i;
+ for (i= 0; i < array_elements(rseg_array); i++)
+ {
+ rseg_array[i].latch.rd_lock(SRW_LOCK_CALL);
+ size+= rseg_array[i].history_size;
+ if (size > threshold)
+ {
+ exceeds= true;
+ i++;
+ break;
+ }
+ }
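+  /* Release the latches that were acquired above, in reverse order. */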
+ while (i)
+ rseg_array[--i].latch.rd_unlock();
+ return exceeds;
+}
+
+TPOOL_SUPPRESS_TSAN bool trx_sys_t::history_exists()
+{
+ ut_ad(is_initialised());
+ for (auto &rseg : rseg_array)
+ if (rseg.history_size)
+ return true;
+ return false;
+}
+
+TPOOL_SUPPRESS_TSAN size_t trx_sys_t::history_size_approx() const
+{
+ ut_ad(is_initialised());
+ size_t size= 0;
+ for (auto &rseg : rseg_array)
+ size+= rseg.history_size;
+ return size;
+}
+
+/** Create a persistent rollback segment.
+@param space_id system or undo tablespace id
+@return pointer to new rollback segment
+@retval nullptr on failure */
+static trx_rseg_t *trx_rseg_create(ulint space_id)
+{
+ trx_rseg_t *rseg= nullptr;
+ mtr_t mtr;
+
+ mtr.start();
+
+ if (fil_space_t *space= mtr.x_lock_space(space_id))
+ {
+ ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
+ if (buf_block_t *sys_header= trx_sysf_get(&mtr))
+ {
+ ulint rseg_id= trx_sys_rseg_find_free(sys_header);
+ dberr_t err;
+ if (buf_block_t *rblock= rseg_id == ULINT_UNDEFINED
+ ? nullptr : trx_rseg_header_create(space, rseg_id, 0, &mtr, &err))
+ {
+ rseg= &trx_sys.rseg_array[rseg_id];
+ rseg->init(space, rblock->page.id().page_no());
+ ut_ad(rseg->is_persistent());
+ mtr.write<4,mtr_t::MAYBE_NOP>
+ (*sys_header, TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE +
+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE + sys_header->page.frame,
+ space_id);
+ mtr.write<4,mtr_t::MAYBE_NOP>
+ (*sys_header, TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO +
+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE + sys_header->page.frame,
+ rseg->page_no);
+ }
+ }
+ }
+
+ mtr.commit();
+ return rseg;
+}
+
+/** Create the rollback segments.
+@return whether the creation succeeded */
+bool trx_sys_create_rsegs()
+{
+ /* srv_available_undo_logs reflects the number of persistent
+ rollback segments that have been initialized in the
+ transaction system header page. */
+ ut_ad(srv_undo_tablespaces <= TRX_SYS_MAX_UNDO_SPACES);
+
+ if (high_level_read_only) {
+ srv_available_undo_logs = 0;
+ return(true);
+ }
+
+	/* This is executed in single-threaded mode; therefore it is not
+ necessary to use the same mtr in trx_rseg_create(). n_used cannot
+ change while the function is executing. */
+ trx_sysf_get_n_rseg_slots();
+
+ ut_ad(srv_available_undo_logs <= TRX_SYS_N_RSEGS);
+
+ /* The first persistent rollback segment is always initialized
+ in the system tablespace. */
+ ut_a(srv_available_undo_logs > 0);
+
+ for (ulint i = 0; srv_available_undo_logs < TRX_SYS_N_RSEGS;
+ i++, srv_available_undo_logs++) {
+ /* Tablespace 0 is the system tablespace.
+ Dedicated undo log tablespaces start from 1. */
+ ulint space = srv_undo_tablespaces > 0
+ ? (i % srv_undo_tablespaces)
+ + srv_undo_space_id_start
+ : TRX_SYS_SPACE;
+
+ if (!trx_rseg_create(space)) {
+ ib::error() << "Unable to allocate the"
+ " requested innodb_undo_logs";
+ return(false);
+ }
+
+		/* Increase the number of active undo
+		tablespaces in case a new rollback segment was
+		assigned to a new undo tablespace. */
+ if (space > srv_undo_tablespaces_active) {
+ srv_undo_tablespaces_active++;
+
+ ut_ad(srv_undo_tablespaces_active == space);
+ }
+ }
+
+ ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS);
+
+ ib::info info;
+ info << srv_available_undo_logs;
+ if (srv_undo_tablespaces_active) {
+ info << " rollback segments in " << srv_undo_tablespaces_active
+ << " undo tablespaces are active.";
+ } else {
+ info << " rollback segments are active.";
+ }
+
+ return(true);
+}
+
+/** Close the transaction system on shutdown */
+void
+trx_sys_t::close()
+{
+ ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
+ if (!is_initialised()) {
+ return;
+ }
+
+ if (size_t size = view_count()) {
+		ib::error() << "Not all read views were closed before"
+ " shutdown: " << size << " read views open";
+ }
+
+ rw_trx_hash.destroy();
+
+ /* There can't be any active transactions. */
+
+ for (ulint i = 0; i < array_elements(temp_rsegs); ++i) {
+ temp_rsegs[i].destroy();
+ }
+ for (ulint i = 0; i < array_elements(rseg_array); ++i) {
+ rseg_array[i].destroy();
+ }
+
+ ut_a(trx_list.empty());
+ trx_list.close();
+ m_initialised = false;
+}
+
+/** @return total number of active (non-prepared) transactions */
+ulint trx_sys_t::any_active_transactions()
+{
+ uint32_t total_trx= 0;
+
+ trx_sys.trx_list.for_each([&total_trx](const trx_t &trx) {
+ if (trx.state == TRX_STATE_COMMITTED_IN_MEMORY ||
+ (trx.state == TRX_STATE_ACTIVE && trx.id))
+ total_trx++;
+ });
+
+ return total_trx;
+}
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
new file mode 100644
index 00000000..f9a152e2
--- /dev/null
+++ b/storage/innobase/trx/trx0trx.cc
@@ -0,0 +1,2180 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0trx.cc
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0trx.h"
+
+#ifdef WITH_WSREP
+#include <mysql/service_wsrep.h>
+#endif
+
+#include <mysql/service_thd_error_context.h>
+
+#include "btr0sea.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "que0que.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "trx0roll.h"
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "trx0xa.h"
+#include "ut0pool.h"
+#include "ut0vec.h"
+#include "log.h"
+
+#include <set>
+#include <new>
+
+/** The bit pattern corresponding to TRX_ID_MAX */
+const byte trx_id_max_bytes[8] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+/** The bit pattern corresponding to max timestamp */
+const byte timestamp_max_bytes[7] = {
+ 0x7f, 0xff, 0xff, 0xff, 0x0f, 0x42, 0x3f
+};
+
+
+static const ulint MAX_DETAILED_ERROR_LEN = 256;
+
+/*************************************************************//**
+Set detailed error message for the transaction. */
+void
+trx_set_detailed_error(
+/*===================*/
+ trx_t* trx, /*!< in: transaction struct */
+ const char* msg) /*!< in: detailed error message */
+{
+ strncpy(trx->detailed_error, msg, MAX_DETAILED_ERROR_LEN - 1);
+ trx->detailed_error[MAX_DETAILED_ERROR_LEN - 1] = '\0';
+}
+
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewound before reading from it. */
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+ trx_t* trx, /*!< in: transaction struct */
+ FILE* file) /*!< in: file to read message from */
+{
+ os_file_read_string(file, trx->detailed_error, MAX_DETAILED_ERROR_LEN);
+}
+
+/********************************************************************//**
+Initialize transaction object.
+@param trx trx to initialize */
+static
+void
+trx_init(
+/*=====*/
+ trx_t* trx)
+{
+ trx->state = TRX_STATE_NOT_STARTED;
+
+ trx->is_recovered = false;
+
+ trx->op_info = "";
+
+ trx->active_commit_ordered = false;
+
+ trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+ trx->check_foreigns = true;
+
+ trx->check_unique_secondary = true;
+
+ trx->lock.n_rec_locks = 0;
+
+ trx->dict_operation = false;
+
+ trx->error_state = DB_SUCCESS;
+
+ trx->error_key_num = ULINT_UNDEFINED;
+
+ trx->undo_no = 0;
+
+ trx->rsegs.m_redo.rseg = NULL;
+
+ trx->rsegs.m_noredo.rseg = NULL;
+
+ trx->read_only = false;
+
+ trx->auto_commit = false;
+
+ trx->will_lock = false;
+
+ trx->bulk_insert = false;
+
+ trx->apply_online_log = false;
+
+ ut_d(trx->start_file = 0);
+
+ ut_d(trx->start_line = 0);
+
+ trx->magic_n = TRX_MAGIC_N;
+
+ trx->last_sql_stat_start.least_undo_no = 0;
+
+ ut_ad(!trx->read_view.is_open());
+
+ trx->lock.rec_cached = 0;
+
+ trx->lock.table_cached = 0;
+#ifdef WITH_WSREP
+ ut_ad(!trx->wsrep);
+#endif /* WITH_WSREP */
+}
+
+/** For managing the life-cycle of the trx_t instance that we get
+from the pool. */
+struct TrxFactory {
+
+ /** Initializes a transaction object. It must be explicitly started
+ with trx_start_if_not_started() before using it. The default isolation
+ level is TRX_ISO_REPEATABLE_READ.
+ @param trx Transaction instance to initialise */
+ static void init(trx_t* trx)
+ {
+ /* Explicitly call the constructor of the already
+ allocated object. trx_t objects are allocated by
+ ut_zalloc_nokey() in Pool::Pool() which would not call
+ the constructors of the trx_t members. */
+ new(&trx->mod_tables) trx_mod_tables_t();
+
+ new(&trx->lock.table_locks) lock_list();
+
+ new(&trx->read_view) ReadView();
+
+ trx->rw_trx_hash_pins = 0;
+ trx_init(trx);
+
+ trx->dict_operation_lock_mode = false;
+
+ trx->detailed_error = reinterpret_cast<char*>(
+ ut_zalloc_nokey(MAX_DETAILED_ERROR_LEN));
+
+ trx->lock.lock_heap = mem_heap_create_typed(
+ 1024, MEM_HEAP_FOR_LOCK_HEAP);
+ pthread_cond_init(&trx->lock.cond, nullptr);
+
+ UT_LIST_INIT(trx->lock.trx_locks, &lock_t::trx_locks);
+ UT_LIST_INIT(trx->lock.evicted_tables,
+ &dict_table_t::table_LRU);
+
+ UT_LIST_INIT(
+ trx->trx_savepoints,
+ &trx_named_savept_t::trx_savepoints);
+
+ trx->mutex_init();
+ }
+
+ /** Release resources held by the transaction object.
+ @param trx the transaction for which to release resources */
+ static void destroy(trx_t* trx)
+ {
+#ifdef __SANITIZE_ADDRESS__
+ /* Unpoison the memory for AddressSanitizer */
+ MEM_MAKE_ADDRESSABLE(trx, sizeof *trx);
+#elif !__has_feature(memory_sanitizer)
+ /* In Valgrind, we cannot cancel MEM_NOACCESS() without
+ changing the state of the V bits (which indicate
+ which bits are initialized).
+ We will declare the contents as initialized.
+ We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */
+ MEM_MAKE_DEFINED(trx, sizeof *trx);
+#endif
+
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+ ut_ad(!trx->mysql_thd);
+
+ ut_a(trx->lock.wait_lock == NULL);
+ ut_a(trx->lock.wait_thr == NULL);
+ ut_a(!trx->dict_operation_lock_mode);
+
+ if (trx->lock.lock_heap != NULL) {
+ mem_heap_free(trx->lock.lock_heap);
+ trx->lock.lock_heap = NULL;
+ }
+
+ pthread_cond_destroy(&trx->lock.cond);
+
+ ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+ ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);
+
+ ut_free(trx->detailed_error);
+
+ trx->mutex_destroy();
+
+ trx->mod_tables.~trx_mod_tables_t();
+
+ ut_ad(!trx->read_view.is_open());
+
+ trx->lock.table_locks.~lock_list();
+
+ trx->read_view.~ReadView();
+ }
+};
+
+/** The lock strategy for TrxPool */
+class TrxPoolLock
+{
+ mysql_mutex_t mutex;
+
+public:
+ /** Create the mutex */
+ void create()
+ {
+ mysql_mutex_init(trx_pool_mutex_key, &mutex, nullptr);
+ }
+
+ /** Acquire the mutex */
+ void enter() { mysql_mutex_lock(&mutex); }
+
+ /** Release the mutex */
+ void exit() { mysql_mutex_unlock(&mutex); }
+
+ /** Free the mutex */
+ void destroy() { mysql_mutex_destroy(&mutex); }
+};
+
+/** The lock strategy for the TrxPoolManager */
+class TrxPoolManagerLock
+{
+ mysql_mutex_t mutex;
+
+public:
+ /** Create the mutex */
+ void create()
+ {
+ mysql_mutex_init(trx_pool_manager_mutex_key, &mutex, nullptr);
+ }
+
+ /** Acquire the mutex */
+ void enter() { mysql_mutex_lock(&mutex); }
+
+ /** Release the mutex */
+ void exit() { mysql_mutex_unlock(&mutex); }
+
+ /** Free the mutex */
+ void destroy() { mysql_mutex_destroy(&mutex); }
+};
+
+/** Use explicit mutexes for the trx_t pool and its manager. */
+typedef Pool<trx_t, TrxFactory, TrxPoolLock> trx_pool_t;
+typedef PoolManager<trx_pool_t, TrxPoolManagerLock > trx_pools_t;
+
+/** The trx_t pool manager */
+static trx_pools_t* trx_pools;
+
+/** Size of one trx_t pool in bytes. */
+static const ulint MAX_TRX_BLOCK_SIZE = 1024 * 1024 * 4;
+
+/** Create the trx_t pool */
+void
+trx_pool_init()
+{
+ trx_pools = UT_NEW_NOKEY(trx_pools_t(MAX_TRX_BLOCK_SIZE));
+
+ ut_a(trx_pools != 0);
+}
+
+/** Destroy the trx_t pool */
+void
+trx_pool_close()
+{
+ UT_DELETE(trx_pools);
+
+ trx_pools = 0;
+}
+
+/** @return an allocated transaction */
+trx_t *trx_create()
+{
+ trx_t* trx = trx_pools->get();
+
+#ifdef __SANITIZE_ADDRESS__
+ /* Unpoison the memory for AddressSanitizer.
+ It may have been poisoned in trx_t::free().*/
+ MEM_MAKE_ADDRESSABLE(trx, sizeof *trx);
+#elif !__has_feature(memory_sanitizer)
+ /* In Valgrind, we cannot cancel MEM_NOACCESS() without
+ changing the state of the V bits (which indicate
+ which bits are initialized).
+ We will declare the contents as initialized.
+ We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */
+ MEM_MAKE_DEFINED(trx, sizeof *trx);
+#endif
+
+ trx->assert_freed();
+
+ mem_heap_t* heap;
+ ib_alloc_t* alloc;
+
+	/* We just got trx from the pool; it should be non-locking */
+ ut_ad(!trx->will_lock);
+ ut_ad(!trx->rw_trx_hash_pins);
+
+ DBUG_LOG("trx", "Create: " << trx);
+
+ heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 8);
+
+ alloc = ib_heap_allocator_create(heap);
+
+ trx->autoinc_locks = ib_vector_create(alloc, sizeof(void**), 4);
+
+ ut_ad(trx->mod_tables.empty());
+ ut_ad(trx->lock.n_rec_locks == 0);
+ ut_ad(trx->lock.table_cached == 0);
+ ut_ad(trx->lock.rec_cached == 0);
+ ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);
+
+ trx_sys.register_trx(trx);
+
+ return(trx);
+}
+
+/** Free the memory to trx_pools */
+void trx_t::free()
+{
+#ifdef HAVE_MEM_CHECK
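+  /* Parts of the XID buffer may be uninitialized; mark them as defined
+  so that the MEM_CHECK_DEFINED() below does not report them. */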
+ if (xid.is_null())
+ MEM_MAKE_DEFINED(&xid, sizeof xid);
+ else
+ MEM_MAKE_DEFINED(&xid.data[xid.gtrid_length + xid.bqual_length],
+ sizeof xid.data - (xid.gtrid_length + xid.bqual_length));
+#endif
+ MEM_CHECK_DEFINED(this, sizeof *this);
+
+ ut_ad(!n_mysql_tables_in_use);
+ ut_ad(!mysql_log_file_name);
+ ut_ad(!mysql_n_tables_locked);
+ ut_ad(!will_lock);
+ ut_ad(error_state == DB_SUCCESS);
+ ut_ad(magic_n == TRX_MAGIC_N);
+ ut_ad(!read_only);
+ ut_ad(!lock.wait_lock);
+
+ dict_operation= false;
+ trx_sys.deregister_trx(this);
+ assert_freed();
+ trx_sys.rw_trx_hash.put_pins(this);
+
+ mysql_thd= nullptr;
+
+ // FIXME: We need to avoid this heap free/alloc for each commit.
+ if (autoinc_locks)
+ {
+ ut_ad(ib_vector_is_empty(autoinc_locks));
+ /* We allocated a dedicated heap for the vector. */
+ ib_vector_free(autoinc_locks);
+ autoinc_locks= NULL;
+ }
+
+ MEM_NOACCESS(&skip_lock_inheritance_and_n_ref,
+ sizeof skip_lock_inheritance_and_n_ref);
+ /* do not poison mutex */
+ MEM_NOACCESS(&id, sizeof id);
+ MEM_NOACCESS(&state, sizeof state);
+ MEM_NOACCESS(&is_recovered, sizeof is_recovered);
+#ifdef WITH_WSREP
+ MEM_NOACCESS(&wsrep, sizeof wsrep);
+#endif
+ read_view.mem_noaccess();
+ MEM_NOACCESS(&lock, sizeof lock);
+ MEM_NOACCESS(&op_info, sizeof op_info);
+ MEM_NOACCESS(&isolation_level, sizeof isolation_level);
+ MEM_NOACCESS(&check_foreigns, sizeof check_foreigns);
+ MEM_NOACCESS(&is_registered, sizeof is_registered);
+ MEM_NOACCESS(&active_commit_ordered, sizeof active_commit_ordered);
+ MEM_NOACCESS(&check_unique_secondary, sizeof check_unique_secondary);
+ MEM_NOACCESS(&flush_log_later, sizeof flush_log_later);
+ MEM_NOACCESS(&must_flush_log_later, sizeof must_flush_log_later);
+ MEM_NOACCESS(&duplicates, sizeof duplicates);
+ MEM_NOACCESS(&dict_operation, sizeof dict_operation);
+ MEM_NOACCESS(&dict_operation_lock_mode, sizeof dict_operation_lock_mode);
+ MEM_NOACCESS(&start_time, sizeof start_time);
+ MEM_NOACCESS(&start_time_micro, sizeof start_time_micro);
+ MEM_NOACCESS(&commit_lsn, sizeof commit_lsn);
+ MEM_NOACCESS(&mysql_thd, sizeof mysql_thd);
+ MEM_NOACCESS(&mysql_log_file_name, sizeof mysql_log_file_name);
+ MEM_NOACCESS(&mysql_log_offset, sizeof mysql_log_offset);
+ MEM_NOACCESS(&n_mysql_tables_in_use, sizeof n_mysql_tables_in_use);
+ MEM_NOACCESS(&mysql_n_tables_locked, sizeof mysql_n_tables_locked);
+ MEM_NOACCESS(&error_state, sizeof error_state);
+ MEM_NOACCESS(&error_info, sizeof error_info);
+ MEM_NOACCESS(&error_key_num, sizeof error_key_num);
+ MEM_NOACCESS(&graph, sizeof graph);
+ MEM_NOACCESS(&trx_savepoints, sizeof trx_savepoints);
+ MEM_NOACCESS(&undo_no, sizeof undo_no);
+ MEM_NOACCESS(&last_sql_stat_start, sizeof last_sql_stat_start);
+ MEM_NOACCESS(&rsegs, sizeof rsegs);
+ MEM_NOACCESS(&roll_limit, sizeof roll_limit);
+ MEM_NOACCESS(&in_rollback, sizeof in_rollback);
+ MEM_NOACCESS(&pages_undone, sizeof pages_undone);
+ MEM_NOACCESS(&n_autoinc_rows, sizeof n_autoinc_rows);
+ MEM_NOACCESS(&autoinc_locks, sizeof autoinc_locks);
+ MEM_NOACCESS(&read_only, sizeof read_only);
+ MEM_NOACCESS(&auto_commit, sizeof auto_commit);
+ MEM_NOACCESS(&will_lock, sizeof will_lock);
+ MEM_NOACCESS(&fts_trx, sizeof fts_trx);
+ MEM_NOACCESS(&fts_next_doc_id, sizeof fts_next_doc_id);
+ MEM_NOACCESS(&flush_tables, sizeof flush_tables);
+#ifdef UNIV_DEBUG
+ MEM_NOACCESS(&start_line, sizeof start_line);
+ MEM_NOACCESS(&start_file, sizeof start_file);
+#endif /* UNIV_DEBUG */
+ MEM_NOACCESS(&xid, sizeof xid);
+ MEM_NOACCESS(&mod_tables, sizeof mod_tables);
+ MEM_NOACCESS(&detailed_error, sizeof detailed_error);
+ MEM_NOACCESS(&magic_n, sizeof magic_n);
+ MEM_NOACCESS(&apply_online_log, sizeof apply_online_log);
+ trx_pools->mem_free(this);
+}
+
+/** Transition to committed state, to release implicit locks. */
+TRANSACTIONAL_INLINE inline void trx_t::commit_state()
+{
+ ut_ad(state == TRX_STATE_PREPARED
+ || state == TRX_STATE_PREPARED_RECOVERED
+ || state == TRX_STATE_ACTIVE);
+ /* This makes the transaction committed in memory and makes its
+ changes to data visible to other transactions. NOTE that there is a
+ small discrepancy from the strict formal visibility rules here: a
+ user of the database can see modifications made by another
+ transaction T even before the necessary redo log segment has been
+ flushed to the disk. If the database happens to crash before the
+ flush, the user has seen modifications from T which will never be a
+ committed transaction. However, any transaction T2 which sees the
+ modifications of the committing transaction T, and which also itself
+ makes modifications to the database, will get an lsn larger than the
+ committing transaction T. In the case where the log flush fails, and
+ T never gets committed, also T2 will never get committed. */
+ TMTrxGuard tg{*this};
+ state= TRX_STATE_COMMITTED_IN_MEMORY;
+ ut_ad(id || !is_referenced());
+}
+
+/** Release any explicit locks of a committing transaction. */
+inline void trx_t::release_locks()
+{
+ DEBUG_SYNC_C("trx_t_release_locks_enter");
+ DBUG_ASSERT(state == TRX_STATE_COMMITTED_IN_MEMORY);
+ DBUG_ASSERT(!is_referenced());
+
+ if (UT_LIST_GET_LEN(lock.trx_locks))
+ {
+ lock_release(this);
+ ut_ad(!lock.n_rec_locks);
+ ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+ ut_ad(ib_vector_is_empty(autoinc_locks));
+ mem_heap_empty(lock.lock_heap);
+ }
+
+ lock.table_locks.clear();
+ reset_skip_lock_inheritance();
+ id= 0;
+ while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables))
+ {
+ UT_LIST_REMOVE(lock.evicted_tables, table);
+ dict_mem_table_free(table);
+ }
+ DEBUG_SYNC_C("after_trx_committed_in_memory");
+}
+
+/** At shutdown, frees a transaction object. */
+TRANSACTIONAL_TARGET void trx_free_at_shutdown(trx_t *trx)
+{
+ ut_ad(trx->is_recovered);
+ ut_a(trx_state_eq(trx, TRX_STATE_PREPARED)
+ || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)
+ || (trx_state_eq(trx, TRX_STATE_ACTIVE)
+ && (!srv_was_started
+ || srv_operation == SRV_OPERATION_RESTORE
+ || srv_operation == SRV_OPERATION_RESTORE_EXPORT
+ || srv_read_only_mode
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
+ || (!srv_is_being_started
+ && !srv_undo_sources && srv_fast_shutdown))));
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+
+ ut_d(trx->apply_online_log = false);
+ trx->commit_state();
+ trx->release_locks();
+ trx->mod_tables.clear();
+ trx_undo_free_at_shutdown(trx);
+
+ ut_a(!trx->read_only);
+
+ DBUG_LOG("trx", "Free prepared: " << trx);
+ trx->state = TRX_STATE_NOT_STARTED;
+ ut_ad(!UT_LIST_GET_LEN(trx->lock.trx_locks));
+ trx->free();
+}
+
+
+/**
+ Disconnect a prepared transaction from MySQL
+ @param[in,out] trx transaction
+*/
+void trx_disconnect_prepared(trx_t *trx)
+{
+ ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(trx->mysql_thd);
+ ut_ad(!trx->mysql_log_file_name);
+ trx->read_view.close();
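+  /* Freeze trx_sys.trx_list so that concurrent iteration over the list
+  does not observe a partially updated transaction object. */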
+ trx_sys.trx_list.freeze();
+ trx->is_recovered= true;
+ trx->mysql_thd= NULL;
+ trx_sys.trx_list.unfreeze();
+ /* todo/fixme: suggest to do it at innodb prepare */
+ trx->will_lock= false;
+ trx_sys.rw_trx_hash.put_pins(trx);
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Resurrect the table locks for a resurrected transaction. */
+static dberr_t trx_resurrect_table_locks(trx_t *trx, const trx_undo_t &undo)
+{
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(undo.rseg == trx->rsegs.m_redo.rseg);
+
+ if (undo.empty())
+ return DB_SUCCESS;
+
+ mtr_t mtr;
+ std::map<table_id_t, bool> tables;
+ mtr.start();
+
+ dberr_t err;
+ if (buf_block_t *block=
+ buf_page_get_gen(page_id_t(trx->rsegs.m_redo.rseg->space->id,
+ undo.top_page_no), 0, RW_S_LATCH, nullptr,
+ BUF_GET, &mtr, &err))
+ {
+ buf_block_t *undo_block= block;
+ const trx_undo_rec_t *undo_rec= block->page.frame + undo.top_offset;
+
+ do
+ {
+ ulint type;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ ulint cmpl_info;
+ bool updated_extern;
+
+ if (undo_block != block)
+ {
+ mtr.memo_release(undo_block, MTR_MEMO_PAGE_S_FIX);
+ undo_block= block;
+ }
+ trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
+ &updated_extern, &undo_no, &table_id);
+ tables.emplace(table_id, type == TRX_UNDO_EMPTY);
+ undo_rec= trx_undo_get_prev_rec(block, page_offset(undo_rec),
+ undo.hdr_page_no, undo.hdr_offset,
+ true, &mtr);
+ }
+ while (undo_rec);
+ }
+
+ mtr.commit();
+
+ if (err != DB_SUCCESS)
+ return err;
+
+ for (auto p : tables)
+ {
+ if (dict_table_t *table=
+ dict_table_open_on_id(p.first, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE))
+ {
+ if (!table->is_readable())
+ {
+ dict_sys.lock(SRW_LOCK_CALL);
+ table->release();
+ dict_sys.remove(table);
+ dict_sys.unlock();
+ continue;
+ }
+
+ if (trx->state == TRX_STATE_PREPARED)
+ trx->mod_tables.emplace(table, 0);
+
+ lock_table_resurrect(table, trx, p.second ? LOCK_X : LOCK_IX);
+
+ DBUG_LOG("ib_trx",
+ "resurrect " << ib::hex(trx->id) << " lock on " << table->name);
+ table->release();
+ }
+ }
+
+ return DB_SUCCESS;
+}
+
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/**
+  Resurrect the transactions that were doing inserts/updates at the time of
+  the crash; they need to be undone.
+*/
+static dberr_t trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg,
+ time_t start_time, ulonglong start_time_micro,
+ uint64_t *rows_to_undo)
+{
+ trx_state_t state;
+ /*
+    This is single-threaded startup code; we do not need the
+ protection of trx->mutex here.
+ */
+ switch (undo->state)
+ {
+ case TRX_UNDO_ACTIVE:
+ state= TRX_STATE_ACTIVE;
+ break;
+ case TRX_UNDO_PREPARED:
+ /*
+ Prepared transactions are left in the prepared state
+ waiting for a commit or abort decision from MySQL
+ */
+ state= TRX_STATE_PREPARED;
+ sql_print_information("InnoDB: Transaction " TRX_ID_FMT
+ " was in the XA prepared state.", undo->trx_id);
+ break;
+ default:
+ return DB_SUCCESS;
+ }
+
+ trx_t *trx= trx_create();
+ trx->state= state;
+ ut_d(trx->start_file= __FILE__);
+ ut_d(trx->start_line= __LINE__);
+
+ trx->rsegs.m_redo.undo= undo;
+ trx->undo_no= undo->top_undo_no + 1;
+ trx->rsegs.m_redo.rseg= rseg;
+ /*
+    Transactions with active data will not have rseg size = 1
+    and will not qualify for the purge limit criteria, so it is safe to
+    increment this rollback segment reference count without mutex protection.
+ */
+ trx->rsegs.m_redo.rseg->acquire();
+ trx->xid= undo->xid;
+ trx->id= undo->trx_id;
+ trx->is_recovered= true;
+ trx->start_time= start_time;
+ trx->start_time_micro= start_time_micro;
+ trx->dict_operation= undo->dict_operation;
+
+ trx_sys.rw_trx_hash.insert(trx);
+ trx_sys.rw_trx_hash.put_pins(trx);
+ if (trx_state_eq(trx, TRX_STATE_ACTIVE))
+ *rows_to_undo+= trx->undo_no;
+ return trx_resurrect_table_locks(trx, *undo);
+}
+
+
+/** Initialize (resurrect) transactions at startup. */
+dberr_t trx_lists_init_at_db_start()
+{
+ ut_a(srv_is_being_started);
+ ut_ad(!srv_was_started);
+
+ if (srv_operation == SRV_OPERATION_RESTORE) {
+ /* mariabackup --prepare only deals with
+ the redo log and the data files, not with
+ transactions or the data dictionary. */
+ return trx_rseg_array_init();
+ }
+
+ if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) {
+ return DB_SUCCESS;
+ }
+
+ purge_sys.create();
+ dberr_t err = trx_rseg_array_init();
+
+ if (err != DB_SUCCESS) {
+corrupted:
+ ib::info() << "Retry with innodb_force_recovery=5";
+ return err;
+ }
+
+	/* Look through the rollback segments to see whether undo logs
+	exist for any transactions. */
+ const time_t start_time = time(NULL);
+ const ulonglong start_time_micro= microsecond_interval_timer();
+ uint64_t rows_to_undo = 0;
+
+ for (auto& rseg : trx_sys.rseg_array) {
+ trx_undo_t* undo;
+
+ /* Some rollback segment may be unavailable,
+ especially if the server was previously run with a
+ non-default value of innodb_undo_logs. */
+ if (!rseg.space) {
+ continue;
+ }
+ /* Resurrect other transactions. */
+ for (undo = UT_LIST_GET_FIRST(rseg.undo_list);
+ undo != NULL;
+ undo = UT_LIST_GET_NEXT(undo_list, undo)) {
+ trx_t *trx = trx_sys.find(0, undo->trx_id, false);
+ if (!trx) {
+ err = trx_resurrect(undo, &rseg, start_time,
+ start_time_micro,
+ &rows_to_undo);
+ } else {
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(trx->start_time == start_time);
+ ut_ad(trx->is_recovered);
+ ut_ad(trx->rsegs.m_redo.rseg == &rseg);
+ ut_ad(rseg.is_referenced());
+
+ trx->rsegs.m_redo.undo = undo;
+ if (undo->top_undo_no >= trx->undo_no) {
+ if (trx_state_eq(trx,
+ TRX_STATE_ACTIVE)) {
+ rows_to_undo -= trx->undo_no;
+ rows_to_undo +=
+ undo->top_undo_no + 1;
+ }
+
+ trx->undo_no = undo->top_undo_no + 1;
+ }
+ err = trx_resurrect_table_locks(trx, *undo);
+ }
+
+ if (err != DB_SUCCESS) {
+ goto corrupted;
+ }
+ }
+ }
+
+ if (const auto size = trx_sys.rw_trx_hash.size()) {
+ ib::info() << size
+ << " transaction(s) which must be rolled back or"
+ " cleaned up in total " << rows_to_undo
+ << " row operations to undo";
+ ib::info() << "Trx id counter is " << trx_sys.get_max_trx_id();
+ }
+
+ purge_sys.clone_oldest_view<true>();
+ return DB_SUCCESS;
+}
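+
+/* A condensed view of the startup decision made above (summary for
+illustration only):
+
+    srv_operation == SRV_OPERATION_RESTORE            -> trx_rseg_array_init() only
+    srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN  -> skip the undo scan entirely
+    otherwise                                         -> create purge_sys, scan the
+                                                         rollback segments, resurrect
+                                                         ACTIVE and PREPARED trx
+*/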
+
+/** Assign a persistent rollback segment in a round-robin fashion,
+evenly distributed between 0 and innodb_undo_logs-1
+@return persistent rollback segment
+@retval NULL if innodb_read_only */
+static trx_rseg_t* trx_assign_rseg_low()
+{
+ if (high_level_read_only) {
+ ut_ad(!srv_available_undo_logs);
+ return(NULL);
+ }
+
+ ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS);
+
+ /* The first slot is always assigned to the system tablespace. */
+ ut_ad(trx_sys.rseg_array[0].space == fil_system.sys_space);
+
+ /* Choose a rollback segment evenly distributed between 0 and
+ innodb_undo_logs-1 in a round-robin fashion, skipping those
+ undo tablespaces that are scheduled for truncation. */
+ static Atomic_counter<unsigned> rseg_slot;
+ unsigned slot = rseg_slot++ % TRX_SYS_N_RSEGS;
+ ut_d(if (trx_rseg_n_slots_debug) slot = 0);
+ trx_rseg_t* rseg;
+
+#ifdef UNIV_DEBUG
+ ulint start_scan_slot = slot;
+ bool look_for_rollover = false;
+#endif /* UNIV_DEBUG */
+
+ bool allocated = false;
+
+ do {
+ for (;;) {
+ rseg = &trx_sys.rseg_array[slot];
+
+#ifdef UNIV_DEBUG
+ /* Ensure that we are not revisiting the same
+ slot that we have already inspected. */
+ if (look_for_rollover) {
+ ut_ad(start_scan_slot != slot);
+ }
+ look_for_rollover = true;
+#endif /* UNIV_DEBUG */
+
+ ut_d(if (!trx_rseg_n_slots_debug))
+ slot = (slot + 1) % TRX_SYS_N_RSEGS;
+
+ if (!rseg->space) {
+ continue;
+ }
+
+ ut_ad(rseg->is_persistent());
+
+ if (rseg->space != fil_system.sys_space) {
+ if (rseg->skip_allocation()
+ || !srv_undo_tablespaces) {
+ continue;
+ }
+ } else if (const fil_space_t *space =
+ trx_sys.rseg_array[slot].space) {
+ if (space != fil_system.sys_space
+ && srv_undo_tablespaces > 0) {
+ /** If dedicated
+ innodb_undo_tablespaces have
+ been configured, try to use them
+ instead of the system tablespace. */
+ continue;
+ }
+ }
+
+ break;
+ }
+
+ /* By now we have only selected the rseg but not marked it
+ allocated. By marking it allocated we are ensuring that it will
+ never be selected for UNDO truncate purge. */
+ allocated = rseg->acquire_if_available();
+ } while (!allocated);
+
+ ut_ad(rseg->is_referenced());
+ ut_ad(rseg->is_persistent());
+ return(rseg);
+}
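+
+/* The selection above, reduced to its core (illustrative sketch only; it
+omits the debug-only code and the system-tablespace special case):
+
+    static Atomic_counter<unsigned> counter;
+    for (;;)
+    {
+      trx_rseg_t *rseg= &trx_sys.rseg_array[counter++ % TRX_SYS_N_RSEGS];
+      if (!rseg->space || rseg->skip_allocation())
+        continue;                    // slot unused or scheduled for truncation
+      if (rseg->acquire_if_available())
+        return rseg;                 // referenced: undo truncation will skip it
+    }
+*/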
+
+/** Assign a rollback segment for modifying temporary tables.
+@return the assigned rollback segment */
+trx_rseg_t *trx_t::assign_temp_rseg()
+{
+ ut_ad(!rsegs.m_noredo.rseg);
+ ut_ad(!is_autocommit_non_locking());
+ compile_time_assert(ut_is_2pow(TRX_SYS_N_RSEGS));
+
+ /* Choose a temporary rollback segment between 0 and 127
+ in a round-robin fashion. */
+ static Atomic_counter<unsigned> rseg_slot;
+ trx_rseg_t* rseg = &trx_sys.temp_rsegs[
+ rseg_slot++ & (TRX_SYS_N_RSEGS - 1)];
+ ut_ad(!rseg->is_persistent());
+ rsegs.m_noredo.rseg = rseg;
+
+ if (id == 0) {
+ trx_sys.register_rw(this);
+ }
+
+ return(rseg);
+}
+
+/****************************************************************//**
+Starts a transaction. */
+static
+void
+trx_start_low(
+/*==========*/
+ trx_t* trx, /*!< in: transaction */
+ bool read_write) /*!< in: true if read-write transaction */
+{
+ ut_ad(!trx->in_rollback);
+ ut_ad(!trx->is_recovered);
+ ut_ad(trx->start_line != 0);
+ ut_ad(trx->start_file != 0);
+ ut_ad(trx->roll_limit == 0);
+ ut_ad(trx->error_state == DB_SUCCESS);
+ ut_ad(trx->rsegs.m_redo.rseg == NULL);
+ ut_ad(trx->rsegs.m_noredo.rseg == NULL);
+ ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+ ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+
+ /* Check whether it is an AUTOCOMMIT SELECT */
+ trx->auto_commit = thd_trx_is_auto_commit(trx->mysql_thd);
+
+ trx->read_only = srv_read_only_mode
+ || (!trx->dict_operation
+ && thd_trx_is_read_only(trx->mysql_thd));
+
+ if (!trx->auto_commit) {
+ trx->will_lock = true;
+ } else if (!trx->will_lock) {
+ trx->read_only = true;
+ }
+
+#ifdef WITH_WSREP
+ trx->xid.null();
+#endif /* WITH_WSREP */
+
+ ut_a(ib_vector_is_empty(trx->autoinc_locks));
+ ut_a(trx->lock.table_locks.empty());
+
+ /* No other thread can access this trx object through rw_trx_hash,
+ still it can be found through trx_sys.trx_list. Sometimes it's
+ possible to indirectly protect trx_t::state by freezing
+ trx_sys.trx_list.
+
+ For now we update it without mutex protection, because original code
+ did it this way. It has to be reviewed and fixed properly. */
+ trx->state = TRX_STATE_ACTIVE;
+
+ /* By default all transactions are in the read-only list unless they
+ are non-locking auto-commit read only transactions or background
+ (internal) transactions. Note: Transactions marked explicitly as
+	read only can write to temporary tables; we put those on the RO
+ list too. */
+
+ if (!trx->read_only
+ && (!trx->mysql_thd || read_write || trx->dict_operation)) {
+
+ /* Temporary rseg is assigned only if the transaction
+ updates a temporary table */
+ trx->rsegs.m_redo.rseg = trx_assign_rseg_low();
+ ut_ad(trx->rsegs.m_redo.rseg != 0
+ || srv_read_only_mode
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
+
+ trx_sys.register_rw(trx);
+ } else {
+ if (!trx->is_autocommit_non_locking()) {
+
+ /* If this is a read-only transaction that is writing
+ to a temporary table then it needs a transaction id
+ to write to the temporary table. */
+
+ if (read_write) {
+ ut_ad(!srv_read_only_mode);
+ trx_sys.register_rw(trx);
+ }
+ } else {
+ ut_ad(!read_write);
+ }
+ }
+
+ trx->start_time = time(NULL);
+ trx->start_time_micro = trx->mysql_thd
+ ? thd_start_utime(trx->mysql_thd)
+ : microsecond_interval_timer();
+
+ ut_a(trx->error_state == DB_SUCCESS);
+}
+
+/** Set the serialisation number for a persistent committed transaction.
+@param[in,out] trx committed transaction with persistent changes */
+static
+void
+trx_serialise(trx_t* trx)
+{
+ trx_rseg_t *rseg = trx->rsegs.m_redo.rseg;
+ ut_ad(rseg);
+
+ if (rseg->last_page_no == FIL_NULL) {
+ mysql_mutex_lock(&purge_sys.pq_mutex);
+ }
+
+ trx_sys.assign_new_trx_no(trx);
+
+ /* If the rollback segment is not empty then the
+ new trx_t::no can't be less than any trx_t::no
+ already in the rollback segment. User threads only
+ produce events when a rollback segment is empty. */
+ if (rseg->last_page_no == FIL_NULL) {
+ purge_sys.purge_queue.push(TrxUndoRsegs(trx->rw_trx_hash_element->no,
+ *rseg));
+ mysql_mutex_unlock(&purge_sys.pq_mutex);
+ }
+}
+
+/****************************************************************//**
+Assign the transaction its history serialisation number and write the
+update UNDO log record to the assigned rollback segment. */
+static
+void
+trx_write_serialisation_history(
+/*============================*/
+ trx_t* trx, /*!< in/out: transaction */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ /* Change the undo log segment states from TRX_UNDO_ACTIVE to some
+ other state: these modifications to the file data structure define
+ the transaction as committed in the file based domain, at the
+ serialization point of the log sequence number lsn obtained below. */
+
+ /* We have to hold the rseg mutex because update log headers have
+ to be put to the history list in the (serialisation) order of the
+ UNDO trx number. This is required for the purge in-memory data
+ structures too. */
+
+ if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) {
+ /* Undo log for temporary tables is discarded at transaction
+ commit. There is no purge for temporary tables, and also no
+ MVCC, because they are private to a session. */
+
+ mtr_t temp_mtr;
+ temp_mtr.start();
+ temp_mtr.set_log_mode(MTR_LOG_NO_REDO);
+ trx_undo_set_state_at_finish(undo, &temp_mtr);
+ temp_mtr.commit();
+ }
+
+ trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
+ if (!rseg) {
+ ut_ad(!trx->rsegs.m_redo.undo);
+ return;
+ }
+
+ trx_undo_t*& undo = trx->rsegs.m_redo.undo;
+
+ if (!undo) {
+ return;
+ }
+
+ ut_ad(!trx->read_only);
+ ut_ad(!undo || undo->rseg == rseg);
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
+
+ /* Assign the transaction serialisation number and add any
+ undo log to the purge queue. */
+ trx_serialise(trx);
+ if (undo) {
+ UT_LIST_REMOVE(rseg->undo_list, undo);
+ trx_purge_add_undo_to_history(trx, undo, mtr);
+ }
+
+ rseg->latch.wr_unlock();
+
+ MONITOR_INC(MONITOR_TRX_COMMIT_UNDO);
+}
+
+/********************************************************************
+Finalize a transaction containing updates for a FTS table. */
+static
+void
+trx_finalize_for_fts_table(
+/*=======================*/
+ fts_trx_table_t* ftt) /* in: FTS trx table */
+{
+ fts_t* fts = ftt->table->fts;
+ fts_doc_ids_t* doc_ids = ftt->added_doc_ids;
+
+ ut_a(fts->add_wq);
+
+ mem_heap_t* heap = static_cast<mem_heap_t*>(doc_ids->self_heap->arg);
+
+ ib_wqueue_add(fts->add_wq, doc_ids, heap);
+
+ /* fts_trx_table_t no longer owns the list. */
+ ftt->added_doc_ids = NULL;
+}
+
+/******************************************************************//**
+Finalize a transaction containing updates to FTS tables. */
+static
+void
+trx_finalize_for_fts(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ bool is_commit) /*!< in: true if the transaction was
+ committed, false if it was rolled back. */
+{
+ if (is_commit) {
+ const ib_rbt_node_t* node;
+ ib_rbt_t* tables;
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_last(trx->fts_trx->savepoints));
+
+ tables = savepoint->tables;
+
+ for (node = rbt_first(tables);
+ node;
+ node = rbt_next(tables, node)) {
+ fts_trx_table_t** ftt;
+
+ ftt = rbt_value(fts_trx_table_t*, node);
+
+ if ((*ftt)->added_doc_ids) {
+ trx_finalize_for_fts_table(*ftt);
+ }
+ }
+ }
+
+ fts_trx_free(trx->fts_trx);
+ trx->fts_trx = NULL;
+}
+
+extern "C" MYSQL_THD thd_increment_pending_ops(MYSQL_THD);
+extern "C" void thd_decrement_pending_ops(MYSQL_THD);
+
+
+#include "../log/log0sync.h"
+
+/*
+  If required, initiates a write and optionally a flush of the log to
+  disk.
+ @param lsn LSN up to which logs are to be flushed.
+ @param trx transaction; if trx->state is PREPARED, the function will
+ also wait for the flush to complete.
+*/
+static void trx_flush_log_if_needed_low(lsn_t lsn, const trx_t *trx)
+{
+ if (!srv_flush_log_at_trx_commit)
+ return;
+
+ if (log_sys.get_flushed_lsn() > lsn)
+ return;
+
+ const bool flush= srv_file_flush_method != SRV_NOSYNC &&
+ (srv_flush_log_at_trx_commit & 1);
+
+ if (trx->state == TRX_STATE_PREPARED)
+ {
+ /* XA, which is used with binlog as well.
+ Be conservative, use synchronous wait.*/
+sync:
+ log_write_up_to(lsn, flush);
+ return;
+ }
+
+ completion_callback cb;
+ if ((cb.m_param = thd_increment_pending_ops(trx->mysql_thd)))
+ {
+ cb.m_callback = (void (*)(void *)) thd_decrement_pending_ops;
+ log_write_up_to(lsn, flush, false, &cb);
+ }
+ else
+ goto sync;
+}
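+
+/* The effective behaviour per innodb_flush_log_at_trx_commit, as implemented
+above (summary for illustration; assumes innodb_flush_method != nosync):
+
+    0      : return immediately, neither write nor flush at commit
+    odd    : write the log up to lsn and flush (fsync) it
+    even   : write the log up to lsn without flushing it
+
+For an XA-prepared transaction the wait is synchronous; otherwise the THD is
+pinned with thd_increment_pending_ops() and released again by the completion
+callback, so the wait does not block the server thread. */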
+
+/**********************************************************************//**
+If required, flushes the log to disk based on the value of
+innodb_flush_log_at_trx_commit. */
+static
+void
+trx_flush_log_if_needed(
+/*====================*/
+ lsn_t lsn, /*!< in: lsn up to which logs are to be
+ flushed. */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ trx->op_info = "flushing log";
+ trx_flush_log_if_needed_low(lsn, trx);
+ trx->op_info = "";
+}
+
+/** Process tables that were modified by the committing transaction. */
+inline void trx_t::commit_tables()
+{
+ if (undo_no && !mod_tables.empty())
+ {
+ const trx_id_t max_trx_id= trx_sys.get_max_trx_id();
+ const auto now= start_time;
+
+ for (const auto &p : mod_tables)
+ {
+ dict_table_t *table= p.first;
+ table->update_time= now;
+ table->query_cache_inv_trx_id= max_trx_id;
+ }
+ }
+}
+
+/** Evict a table definition due to the rollback of ALTER TABLE.
+@param table_id table identifier
+@param reset_only whether to only reset dict_table_t::def_trx_id */
+void trx_t::evict_table(table_id_t table_id, bool reset_only)
+{
+ ut_ad(in_rollback);
+
+ dict_table_t* table = dict_sys.find_table(table_id);
+ if (!table) {
+ return;
+ }
+
+ table->def_trx_id = 0;
+
+ if (auto ref_count = table->get_ref_count()) {
+ /* This must be a DDL operation that is being rolled
+ back in an active connection. */
+ ut_a(ref_count == 1);
+ ut_ad(!is_recovered);
+ ut_ad(mysql_thd);
+ return;
+ }
+
+ if (reset_only) {
+ return;
+ }
+
+ /* This table should only be locked by this transaction, if at all. */
+ ut_ad(UT_LIST_GET_LEN(table->locks) <= 1);
+ const bool locked = UT_LIST_GET_LEN(table->locks);
+ ut_ad(!locked || UT_LIST_GET_FIRST(table->locks)->trx == this);
+ dict_sys.remove(table, true, locked);
+ if (locked) {
+ UT_LIST_ADD_FIRST(lock.evicted_tables, table);
+ }
+}
+
+TRANSACTIONAL_INLINE inline void trx_t::commit_in_memory(const mtr_t *mtr)
+{
+ /* We already detached from rseg in trx_write_serialisation_history() */
+ ut_ad(!rsegs.m_redo.undo);
+ must_flush_log_later= false;
+ read_view.close();
+
+ if (is_autocommit_non_locking())
+ {
+ ut_ad(id == 0);
+ ut_ad(read_only);
+ ut_ad(!will_lock);
+ ut_a(!is_recovered);
+ ut_ad(!rsegs.m_redo.rseg);
+ ut_ad(!rsegs.m_redo.undo);
+ ut_ad(mysql_thd);
+ ut_ad(state == TRX_STATE_ACTIVE);
+
+ /* Note: We do not have to hold any lock_sys latch here, because
+ this is a non-locking transaction. */
+ ut_a(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+ ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0);
+
+ /* This state change is not protected by any mutex, therefore
+ there is an inherent race here around state transition during
+ printouts. We ignore this race for the sake of efficiency.
+ However, the freezing of trx_sys.trx_list will protect the trx_t
+ instance and it cannot be removed from the trx_list and freed
+ without first unfreezing trx_list. */
+ state= TRX_STATE_NOT_STARTED;
+
+ MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT);
+
+ DBUG_LOG("trx", "Autocommit in memory: " << this);
+ }
+ else
+ {
+#ifdef UNIV_DEBUG
+ if (!UT_LIST_GET_LEN(lock.trx_locks))
+ for (auto l : lock.table_locks)
+ ut_ad(!l);
+#endif /* UNIV_DEBUG */
+ commit_state();
+
+ if (id)
+ {
+ trx_sys.deregister_rw(this);
+
+ /* Wait for any implicit-to-explicit lock conversions to cease,
+ so that there will be no race condition in lock_release(). */
+ while (UNIV_UNLIKELY(is_referenced()))
+ LF_BACKOFF();
+ }
+ else
+ ut_ad(read_only || !rsegs.m_redo.rseg);
+
+ if (read_only || !rsegs.m_redo.rseg)
+ {
+ MONITOR_INC(MONITOR_TRX_RO_COMMIT);
+ }
+ else
+ {
+ commit_tables();
+ MONITOR_INC(MONITOR_TRX_RW_COMMIT);
+ is_recovered= false;
+ }
+
+ if (UNIV_LIKELY(!dict_operation))
+ release_locks();
+ }
+
+ if (trx_rseg_t *rseg= rsegs.m_redo.rseg)
+ /* This is safe due to us having detached the persistent undo log. */
+ rseg->release();
+
+ if (mtr)
+ {
+ if (trx_undo_t *&undo= rsegs.m_noredo.undo)
+ {
+ ut_ad(undo->rseg == rsegs.m_noredo.rseg);
+ trx_undo_commit_cleanup(undo);
+ undo= nullptr;
+ }
+
+    /* NOTE that we could possibly make a group commit more efficient
+    here: call std::this_thread::yield() to allow other transactions to
+    reach their commit as well! */
+
+ /*-------------------------------------*/
+
+ /* Depending on the my.cnf options, we may now write the log
+ buffer to the log files, making the transaction durable if the OS
+ does not crash. We may also flush the log files to disk, making
+ the transaction durable also at an OS crash or a power outage.
+
+ The idea in InnoDB's group commit is that a group of transactions
+ gather behind a trx doing a physical disk write to log files, and
+ when that physical write has been completed, one of those
+ transactions does a write which commits the whole group. Note that
+ this group commit will only bring benefit if there are > 2 users
+ in the database. Then at least 2 users can gather behind one doing
+ the physical log write to disk.
+
+ If we are calling trx_t::commit() under prepare_commit_mutex, we
+ will delay possible log write and flush to a separate function
+ trx_commit_complete_for_mysql(), which is only called when the
+ thread has released the mutex. This is to make the group commit
+ algorithm to work. Otherwise, the prepare_commit mutex would
+ serialize all commits and prevent a group of transactions from
+ gathering. */
+
+ commit_lsn= undo_no || !xid.is_null() ? mtr->commit_lsn() : 0;
+ if (!commit_lsn)
+ /* Nothing to be done. */;
+ else if (flush_log_later)
+ /* Do nothing yet */
+ must_flush_log_later= true;
+ else if (srv_flush_log_at_trx_commit)
+ trx_flush_log_if_needed(commit_lsn, this);
+ }
+
+ ut_ad(!rsegs.m_noredo.undo);
+
+ savepoints_discard();
+
+ if (fts_trx)
+ trx_finalize_for_fts(this, undo_no != 0);
+
+#ifdef WITH_WSREP
+ /* Serialization history has been written and the transaction is
+ committed in memory, which makes this commit ordered. Release commit
+ order critical section. */
+ if (wsrep)
+ {
+ wsrep= false;
+ wsrep_commit_ordered(mysql_thd);
+ }
+#endif /* WITH_WSREP */
+ lock.was_chosen_as_deadlock_victim= false;
+}
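+
+/* A simplified picture of the deferred-flush protocol described above
+(hypothetical caller code, not part of this file):
+
+    trx->flush_log_later= true;
+    trx_commit_for_mysql(trx);           // commit; only commit_lsn is recorded
+    trx->flush_log_later= false;
+    ...                                  // release the prepare_commit mutex
+    trx_commit_complete_for_mysql(trx);  // group write/flush up to commit_lsn
+*/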
+
+void trx_t::commit_cleanup()
+{
+ ut_ad(!dict_operation);
+ ut_ad(!was_dict_operation);
+
+ mutex.wr_lock();
+ state= TRX_STATE_NOT_STARTED;
+ mod_tables.clear();
+
+ assert_freed();
+ trx_init(this);
+ mutex.wr_unlock();
+
+ ut_a(error_state == DB_SUCCESS);
+}
+
+/** Commit the transaction in a mini-transaction.
+@param mtr mini-transaction (if there are any persistent modifications) */
+TRANSACTIONAL_TARGET void trx_t::commit_low(mtr_t *mtr)
+{
+ ut_ad(!mtr || mtr->is_active());
+ ut_d(bool aborted= in_rollback && error_state == DB_DEADLOCK);
+ ut_ad(!mtr == (aborted || !has_logged()));
+ ut_ad(!mtr || !aborted);
+
+ if (fts_trx && undo_no)
+ {
+ ut_a(!is_autocommit_non_locking());
+ /* MDEV-24088 FIXME: Invoke fts_commit() earlier (before possible
+ XA PREPARE), so that we will be able to return an error and rollback
+ the transaction, instead of violating consistency!
+
+    The original claim about DB_DUPLICATE_KEY was:
+    This is a possible scenario if there is a crash between the insert
+    into the DELETED table committing and the transaction committing. The
+    fix would be to return an error from this function. */
+ if (ut_d(dberr_t error=) fts_commit(this))
+ ut_ad(error == DB_DUPLICATE_KEY || error == DB_LOCK_WAIT_TIMEOUT);
+ }
+
+#ifdef ENABLED_DEBUG_SYNC
+ const bool debug_sync= mysql_thd && has_logged_persistent();
+#endif
+
+ if (mtr)
+ {
+ if (UNIV_UNLIKELY(apply_online_log))
+ apply_log();
+ trx_write_serialisation_history(this, mtr);
+
+ /* The following call commits the mini-transaction, making the
+ whole transaction committed in the file-based world, at this log
+ sequence number. The transaction becomes 'durable' when we write
+ the log to disk, but in the logical sense the commit in the
+ file-based data structures (undo logs etc.) happens here.
+
+ NOTE that transaction numbers, which are assigned only to
+ transactions with an update undo log, do not necessarily come in
+ exactly the same order as commit lsn's, if the transactions have
+ different rollback segments. To get exactly the same order we
+ should hold the kernel mutex up to this point, adding to the
+ contention of the kernel mutex. However, if a transaction T2 is
+ able to see modifications made by a transaction T1, T2 will always
+ get a bigger transaction number and a bigger commit lsn than T1. */
+
+ mtr->commit();
+ }
+#ifdef ENABLED_DEBUG_SYNC
+ if (debug_sync)
+ DEBUG_SYNC_C("before_trx_state_committed_in_memory");
+#endif
+
+ commit_in_memory(mtr);
+}
+
+
+void trx_t::commit_persist()
+{
+ mtr_t *mtr= nullptr;
+ mtr_t local_mtr;
+
+ if (has_logged())
+ {
+ mtr= &local_mtr;
+ local_mtr.start();
+ }
+ commit_low(mtr);
+}
+
+
+void trx_t::commit()
+{
+ ut_ad(!was_dict_operation);
+ ut_d(was_dict_operation= dict_operation);
+ dict_operation= false;
+ commit_persist();
+ ut_d(was_dict_operation= false);
+ ut_d(for (const auto &p : mod_tables) ut_ad(!p.second.is_dropped()));
+ commit_cleanup();
+}
+
+
+/****************************************************************//**
+Prepares a transaction for commit/rollback. */
+void
+trx_commit_or_rollback_prepare(
+/*===========================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ /* We are reading trx->state without holding trx->mutex
+ here, because the commit or rollback should be invoked for a
+ running (or recovered prepared) transaction that is associated
+ with the current thread. */
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx_start_low(trx, true);
+ /* fall through */
+
+ case TRX_STATE_ACTIVE:
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ trx->lock.wait_thr = NULL;
+ return;
+
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*********************************************************************//**
+Creates a commit command node struct.
+@return own: commit node struct */
+commit_node_t*
+trx_commit_node_create(
+/*===================*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ commit_node_t* node;
+
+ node = static_cast<commit_node_t*>(mem_heap_alloc(heap, sizeof(*node)));
+ node->common.type = QUE_NODE_COMMIT;
+ node->state = COMMIT_NODE_SEND;
+
+ return(node);
+}
+
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph.
+@return query thread to run next, or NULL */
+que_thr_t*
+trx_commit_step(
+/*============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ commit_node_t* node;
+
+ node = static_cast<commit_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = COMMIT_NODE_SEND;
+ }
+
+ if (node->state == COMMIT_NODE_SEND) {
+ trx_t* trx;
+
+ node->state = COMMIT_NODE_WAIT;
+
+ trx = thr_get_trx(thr);
+
+ ut_a(trx->lock.wait_thr == NULL);
+
+ trx_commit_or_rollback_prepare(trx);
+
+ trx->commit();
+ ut_ad(trx->lock.wait_thr == NULL);
+
+ thr = NULL;
+ } else {
+ ut_ad(node->state == COMMIT_NODE_WAIT);
+
+ node->state = COMMIT_NODE_SEND;
+
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Does the transaction commit for MySQL.
+@return DB_SUCCESS or error number */
+dberr_t
+trx_commit_for_mysql(
+/*=================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+	/* Because we do not do the commit by sending an InnoDB
+	signal to the transaction, we must make sure here that trx has
+	been started. */
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ return DB_SUCCESS;
+ case TRX_STATE_ACTIVE:
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ trx->op_info = "committing";
+ trx->commit();
+ trx->op_info = "";
+ return(DB_SUCCESS);
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk if we called trx_commit_for_mysql()
+with trx->flush_log_later == TRUE. */
+void
+trx_commit_complete_for_mysql(
+/*==========================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ if (trx->id != 0
+ || !trx->must_flush_log_later
+ || (srv_flush_log_at_trx_commit == 1 && trx->active_commit_ordered)) {
+
+ return;
+ }
+
+ trx_flush_log_if_needed(trx->commit_lsn, trx);
+
+ trx->must_flush_log_later = false;
+}
+
+/**********************************************************************//**
+Marks the latest SQL statement ended. */
+void
+trx_mark_sql_stat_end(
+/*==================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ ut_a(trx);
+
+ switch (trx->state) {
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ case TRX_STATE_NOT_STARTED:
+ trx->undo_no = 0;
+ /* fall through */
+ case TRX_STATE_ACTIVE:
+ if (trx->fts_trx != NULL) {
+ fts_savepoint_laststmt_refresh(trx);
+ }
+
+ if (trx->is_bulk_insert()) {
+ /* Allow a subsequent INSERT into an empty table
+ if !unique_checks && !foreign_key_checks. */
+ return;
+ }
+
+ trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+ trx->end_bulk_insert();
+ return;
+ }
+
+ ut_error;
+}
+
+/**********************************************************************//**
+Prints info about a transaction. */
+void
+trx_print_low(
+/*==========*/
+ FILE* f,
+ /*!< in: output stream */
+ const trx_t* trx,
+ /*!< in: transaction */
+ ulint max_query_len,
+ /*!< in: max query length to print,
+ or 0 to use the default max length */
+ ulint n_rec_locks,
+ /*!< in: trx->lock.n_rec_locks */
+ ulint n_trx_locks,
+ /*!< in: length of trx->lock.trx_locks */
+ ulint heap_size)
+ /*!< in: mem_heap_get_size(trx->lock.lock_heap) */
+{
+ if (const trx_id_t id = trx->id) {
+ fprintf(f, "TRANSACTION " TRX_ID_FMT, trx->id);
+ } else {
+ fprintf(f, "TRANSACTION (%p)", trx);
+ }
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ fputs(", not started", f);
+ goto state_ok;
+ case TRX_STATE_ACTIVE:
+ fprintf(f, ", ACTIVE %lu sec",
+ (ulong) difftime(time(NULL), trx->start_time));
+ goto state_ok;
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ fprintf(f, ", ACTIVE (PREPARED) %lu sec",
+ (ulong) difftime(time(NULL), trx->start_time));
+ goto state_ok;
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ fputs(", COMMITTED IN MEMORY", f);
+ goto state_ok;
+ }
+ fprintf(f, ", state %lu", (ulong) trx->state);
+ ut_ad(0);
+state_ok:
+ const char* op_info = trx->op_info;
+
+ if (*op_info) {
+ putc(' ', f);
+ fputs(op_info, f);
+ }
+
+ if (trx->is_recovered) {
+ fputs(" recovered trx", f);
+ }
+
+ putc('\n', f);
+
+ if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
+ fprintf(f, "mysql tables in use %lu, locked %lu\n",
+ (ulong) trx->n_mysql_tables_in_use,
+ (ulong) trx->mysql_n_tables_locked);
+ }
+
+ bool newline = true;
+
+ if (trx->in_rollback) { /* dirty read for performance reasons */
+ fputs("ROLLING BACK ", f);
+ } else if (trx->lock.wait_lock) {
+ fputs("LOCK WAIT ", f);
+ } else {
+ newline = false;
+ }
+
+ if (n_trx_locks > 0 || heap_size > 400) {
+ newline = true;
+
+ fprintf(f, "%lu lock struct(s), heap size %lu,"
+ " %lu row lock(s)",
+ (ulong) n_trx_locks,
+ (ulong) heap_size,
+ (ulong) n_rec_locks);
+ }
+
+ if (trx->undo_no != 0) {
+ newline = true;
+ fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no);
+ }
+
+ if (newline) {
+ putc('\n', f);
+ }
+
+ if (trx->state != TRX_STATE_NOT_STARTED && trx->mysql_thd != NULL) {
+ innobase_mysql_print_thd(
+ f, trx->mysql_thd, static_cast<uint>(max_query_len));
+ }
+}
+
+/**********************************************************************//**
+Prints info about a transaction.
+The caller must hold lock_sys.latch.
+When possible, use trx_print() instead. */
+void
+trx_print_latched(
+/*==============*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ulint max_query_len) /*!< in: max query length to print,
+ or 0 to use the default max length */
+{
+ lock_sys.assert_locked();
+
+ trx_print_low(f, trx, max_query_len,
+ trx->lock.n_rec_locks,
+ UT_LIST_GET_LEN(trx->lock.trx_locks),
+ mem_heap_get_size(trx->lock.lock_heap));
+}
+
+/**********************************************************************//**
+Prints info about a transaction.
+Acquires and releases lock_sys.latch. */
+TRANSACTIONAL_TARGET
+void
+trx_print(
+/*======*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ulint max_query_len) /*!< in: max query length to print,
+ or 0 to use the default max length */
+{
+ ulint n_rec_locks, n_trx_locks, heap_size;
+ {
+ TMLockMutexGuard g{SRW_LOCK_CALL};
+ n_rec_locks= trx->lock.n_rec_locks;
+ n_trx_locks= UT_LIST_GET_LEN(trx->lock.trx_locks);
+ heap_size= mem_heap_get_size(trx->lock.lock_heap);
+ }
+
+ trx_print_low(f, trx, max_query_len, n_rec_locks, n_trx_locks, heap_size);
+}
+
+/** Prepare a transaction.
+@return log sequence number that makes the XA PREPARE durable
+@retval 0 if no changes needed to be made durable */
+static lsn_t trx_prepare_low(trx_t *trx)
+{
+ ut_ad(!trx->is_recovered);
+
+ mtr_t mtr;
+
+ if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) {
+ ut_ad(undo->rseg == trx->rsegs.m_noredo.rseg);
+
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ trx_undo_set_state_at_prepare(trx, undo, false, &mtr);
+ mtr.commit();
+ }
+
+ trx_undo_t* undo = trx->rsegs.m_redo.undo;
+
+ if (!undo) {
+ /* There were no changes to persistent tables. */
+ return(0);
+ }
+
+ ut_ad(undo->rseg == trx->rsegs.m_redo.rseg);
+
+ mtr.start();
+
+ /* Change the undo log segment states from TRX_UNDO_ACTIVE to
+ TRX_UNDO_PREPARED: these modifications to the file data
+ structure define the transaction as prepared in the file-based
+ world, at the serialization point of lsn. */
+ trx_undo_set_state_at_prepare(trx, undo, false, &mtr);
+
+ /* Make the XA PREPARE durable. */
+ mtr.commit();
+ ut_ad(mtr.commit_lsn() > 0);
+ return(mtr.commit_lsn());
+}
+
+/****************************************************************//**
+Prepares a transaction. */
+TRANSACTIONAL_TARGET
+static
+void
+trx_prepare(
+/*========*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ /* Only fresh user transactions can be prepared.
+ Recovered transactions cannot. */
+ ut_a(!trx->is_recovered);
+
+ lsn_t lsn = trx_prepare_low(trx);
+
+ DBUG_EXECUTE_IF("ib_trx_crash_during_xa_prepare_step", DBUG_SUICIDE(););
+
+ ut_a(trx->state == TRX_STATE_ACTIVE);
+ {
+ TMTrxGuard tg{*trx};
+ trx->state = TRX_STATE_PREPARED;
+ }
+
+ if (lsn) {
+ /* Depending on the my.cnf options, we may now write the log
+ buffer to the log files, making the prepared state of the
+ transaction durable if the OS does not crash. We may also
+ flush the log files to disk, making the prepared state of the
+ transaction durable also at an OS crash or a power outage.
+
+ The idea in InnoDB's group prepare is that a group of
+ transactions gather behind a trx doing a physical disk write
+ to log files, and when that physical write has been completed,
+ one of those transactions does a write which prepares the whole
+ group. Note that this group prepare will only bring benefit if
+ there are > 2 users in the database. Then at least 2 users can
+ gather behind one doing the physical log write to disk.
+
+ We must not be holding any mutexes or latches here. */
+
+ trx_flush_log_if_needed(lsn, trx);
+
+ if (!UT_LIST_GET_LEN(trx->lock.trx_locks)
+ || trx->isolation_level == TRX_ISO_SERIALIZABLE) {
+ /* Do not release any locks at the
+ SERIALIZABLE isolation level. */
+ } else if (!trx->mysql_thd
+ || thd_sql_command(trx->mysql_thd)
+ != SQLCOM_XA_PREPARE) {
+ /* Do not release locks for XA COMMIT ONE PHASE
+ or for internal distributed transactions
+ (XID::get_my_xid() would be nonzero). */
+ } else {
+ lock_release_on_prepare(trx);
+ }
+ }
+}
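+
+/* A condensed view of the lock handling at the end of trx_prepare()
+(illustration only; it applies only when lsn != 0, i.e. when persistent
+changes were made durable):
+
+    no locks held, or SERIALIZABLE isolation   -> keep all locks
+    internal 2PC or XA COMMIT ONE PHASE        -> keep all locks
+    explicit user XA PREPARE                   -> lock_release_on_prepare()
+*/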
+
+/** XA PREPARE a transaction.
+@param[in,out] trx transaction to prepare */
+void trx_prepare_for_mysql(trx_t* trx)
+{
+ trx_start_if_not_started_xa(trx, false);
+
+ trx->op_info = "preparing";
+
+ trx_prepare(trx);
+
+ trx->op_info = "";
+}
+
+
+struct trx_recover_for_mysql_callback_arg
+{
+ XID *xid_list;
+ uint len;
+ uint count;
+};
+
+
+static my_bool trx_recover_for_mysql_callback(rw_trx_hash_element_t *element,
+ trx_recover_for_mysql_callback_arg *arg)
+{
+ DBUG_ASSERT(arg->len > 0);
+ element->mutex.wr_lock();
+ if (trx_t *trx= element->trx)
+ {
+ /*
+ The state of a read-write transaction can only change from ACTIVE to
+      PREPARED while we are holding the element->mutex. But since this is
+      executed at startup, no state change should occur.
+ */
+ if (trx_state_eq(trx, TRX_STATE_PREPARED))
+ {
+ ut_ad(trx->is_recovered);
+ ut_ad(trx->id);
+ if (arg->count == 0)
+ ib::info() << "Starting recovery for XA transactions...";
+ XID& xid= arg->xid_list[arg->count];
+ if (arg->count++ < arg->len)
+ {
+ trx->state= TRX_STATE_PREPARED_RECOVERED;
+ ib::info() << "Transaction " << trx->id
+ << " in prepared state after recovery";
+ ib::info() << "Transaction contains changes to " << trx->undo_no
+ << " rows";
+ xid= trx->xid;
+ }
+ }
+ }
+ element->mutex.wr_unlock();
+ /* Do not terminate upon reaching arg->len; count all transactions */
+ return false;
+}
+
+
+static my_bool trx_recover_reset_callback(rw_trx_hash_element_t *element,
+ void*)
+{
+ element->mutex.wr_lock();
+ if (trx_t *trx= element->trx)
+ {
+ if (trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED))
+ trx->state= TRX_STATE_PREPARED;
+ }
+ element->mutex.wr_unlock();
+ return false;
+}
+
+
+/**
+ Find prepared transaction objects for recovery.
+
+ @param[out] xid_list prepared transactions
+ @param[in] len number of slots in xid_list
+
+ @return number of prepared transactions stored in xid_list
+*/
+
+int trx_recover_for_mysql(XID *xid_list, uint len)
+{
+ trx_recover_for_mysql_callback_arg arg= { xid_list, len, 0 };
+
+ ut_ad(xid_list);
+ ut_ad(len);
+
+ /* Fill xid_list with PREPARED transactions. */
+ trx_sys.rw_trx_hash.iterate_no_dups(trx_recover_for_mysql_callback, &arg);
+ if (arg.count)
+ {
+ ib::info() << arg.count
+ << " transactions in prepared state after recovery";
+ /* After returning the full list, reset the state, because
+ init_server_components() wants to recover the collection of
+ transactions twice, by first calling tc_log->open() and then
+ ha_recover() directly. */
+ if (arg.count <= len)
+ trx_sys.rw_trx_hash.iterate(trx_recover_reset_callback);
+ }
+ return int(std::min(arg.count, len));
+}
+
+
+struct trx_get_trx_by_xid_callback_arg
+{
+ const XID *xid;
+ trx_t *trx;
+};
+
+
+static my_bool trx_get_trx_by_xid_callback(rw_trx_hash_element_t *element,
+ trx_get_trx_by_xid_callback_arg *arg)
+{
+ my_bool found= 0;
+ element->mutex.wr_lock();
+ if (trx_t *trx= element->trx)
+ {
+ trx->mutex_lock();
+ if (trx->is_recovered &&
+ (trx_state_eq(trx, TRX_STATE_PREPARED) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)) &&
+ arg->xid->eq(&trx->xid))
+ {
+#ifdef WITH_WSREP
+ /* The commit of a prepared recovered Galera
+ transaction needs a valid trx->xid for
+ invoking trx_sys_update_wsrep_checkpoint(). */
+ if (!wsrep_is_wsrep_xid(&trx->xid))
+#endif /* WITH_WSREP */
+ /* Invalidate the XID, so that subsequent calls will not find it. */
+ trx->xid.null();
+ arg->trx= trx;
+ found= 1;
+ }
+ trx->mutex_unlock();
+ }
+ element->mutex.wr_unlock();
+ return found;
+}
+
+/** Look up an X/Open distributed transaction in XA PREPARE state.
+@param[in] xid X/Open XA transaction identifier
+@return transaction on match (the trx_t::xid will be invalidated);
+note that the trx may have been committed before the caller acquires
+trx_t::mutex
+@retval NULL if no match */
+trx_t* trx_get_trx_by_xid(const XID* xid)
+{
+ trx_get_trx_by_xid_callback_arg arg= { xid, 0 };
+
+ if (xid)
+ trx_sys.rw_trx_hash.iterate(trx_get_trx_by_xid_callback, &arg);
+ return arg.trx;
+}
+
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_xa_low(
+/*============================*/
+ trx_t* trx, /*!< in/out: transaction */
+ bool read_write) /*!< in: true if read write transaction */
+{
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx_start_low(trx, read_write);
+ return;
+
+ case TRX_STATE_ACTIVE:
+ if (trx->id == 0 && read_write) {
+ /* If the transaction is tagged as read-only then
+ it can only write to temp tables and for such
+ transactions we don't want to move them to the
+ trx_sys_t::rw_trx_hash. */
+ if (!trx->read_only) {
+ trx_set_rw_mode(trx);
+ }
+ }
+ return;
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_low(
+/*==========================*/
+ trx_t* trx, /*!< in: transaction */
+ bool read_write) /*!< in: true if read write transaction */
+{
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx_start_low(trx, read_write);
+ return;
+
+ case TRX_STATE_ACTIVE:
+ if (read_write && trx->id == 0 && !trx->read_only) {
+ trx_set_rw_mode(trx);
+ }
+ return;
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/**
+Start a transaction for internal processing.
+@param trx transaction
+@param read_write whether writes may be performed */
+void trx_start_internal_low(trx_t *trx, bool read_write)
+{
+ trx->will_lock= true;
+ trx_start_low(trx, read_write);
+}
+
+/** Start a transaction for a DDL operation.
+@param trx transaction */
+void trx_start_for_ddl_low(trx_t *trx)
+{
+ /* Flag this transaction as a dictionary operation, so that
+ the data dictionary will be locked in crash recovery. */
+ trx->dict_operation= true;
+ trx_start_internal_low(trx, true);
+}
+
+/*************************************************************//**
+Set the transaction as a read-write transaction if it is not already
+tagged as such. Read-only transactions that are writing to temporary
+tables are assigned an ID and a rollback segment but are not added
+to the trx read-write list because their updates should not be visible
+to other transactions and therefore their changes can be ignored by
+MVCC. */
+void
+trx_set_rw_mode(
+/*============*/
+ trx_t* trx) /*!< in/out: transaction that is RW */
+{
+ ut_ad(trx->rsegs.m_redo.rseg == 0);
+ ut_ad(!trx->is_autocommit_non_locking());
+ ut_ad(!trx->read_only);
+ ut_ad(trx->id == 0);
+
+ if (high_level_read_only) {
+ return;
+ }
+
+ trx->rsegs.m_redo.rseg = trx_assign_rseg_low();
+ ut_ad(trx->rsegs.m_redo.rseg != 0);
+
+ trx_sys.register_rw(trx);
+ ut_ad(trx->id);
+
+ /* So that we can see our own changes. */
+ if (trx->read_view.is_open()) {
+ trx->read_view.set_creator_trx_id(trx->id);
+ }
+}
diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc
new file mode 100644
index 00000000..cd21ebe1
--- /dev/null
+++ b/storage/innobase/trx/trx0undo.cc
@@ -0,0 +1,1581 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0undo.cc
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0undo.h"
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "trx0rseg.h"
+#include "log.h"
+
+/* How should the old versions in the history list be managed?
+ ----------------------------------------------------------
+If each transaction is given a whole page for its update undo log, file
+space consumption can be 10 times higher than necessary. Therefore,
+partly filled update undo log pages should be reusable. But then there
+is no way individual pages can be ordered so that the ordering agrees
+with the serialization numbers of the transactions on the pages. Thus,
+the history list must be formed of undo logs, not their header pages as
+it was in the old implementation.
+ However, on a single header page the transactions are placed in
+the order of their serialization numbers. As old versions are purged, we
+may free the page when the last transaction on the page has been purged.
+ A problem is that the purge has to go through the transactions
+in the serialization order. This means that we have to look through all
+rollback segments for the one that has the smallest transaction number
+in its history list.
+ When should we do a purge? A purge is necessary when space is
+running out in any of the rollback segments. Then we may have to purge
+also old versions which might be needed by some consistent read. How do
+we trigger the start of a purge? When a transaction writes to an undo log,
+it may notice that the space is running out. When a read view is closed,
+it may make some history superfluous. The server can have a utility which
+periodically checks if it can purge some history.
+	In a parallelized purge we have the problem that a query thread
+can remove a delete marked clustered index record before another query
+thread has processed an earlier version of the record, which cannot then
+be done because the row cannot be constructed from the clustered index
+record. To avoid this problem, we will store in the update and delete mark
+undo record also the columns necessary to construct the secondary index
+entries which are modified.
+ We can latch the stack of versions of a single clustered index record
+by taking a latch on the clustered index page. As long as the latch is held,
+no new versions can be added and no versions removed by undo. But, a purge
+can still remove old versions from the bottom of the stack. */
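+
+/* Illustrative sketch of the merge described above (not the actual
+purge_sys implementation): purge effectively maintains a min-heap of
+(transaction number, rollback segment) pairs so that it can always pick
+the rollback segment whose history list starts with the smallest
+serialisation number:
+
+    // at commit:  push (trx->no, rseg) when the rseg history was empty
+    // at purge:   pop the smallest trx_no, consume that rseg's history,
+    //             and push the rseg back with the next trx_no it holds
+*/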
+
+/* How to protect rollback segments, undo logs, and history lists with
+ -------------------------------------------------------------------
+latches?
+-------
+When a transaction does its first insert or modify in the clustered index, an
+undo log is assigned for it. Then we must have an x-latch to the rollback
+segment header.
+ When the transaction performs modifications or rolls back, its
+undo log is protected by undo page latches.
+Only the thread that is associated with the transaction may hold multiple
+undo page latches at a time. Undo pages are always private to a single
+transaction. Other threads that are performing MVCC reads
+or checking for implicit locks will lock at most one undo page at a time
+in trx_undo_get_undo_rec_low().
+ When the transaction commits, its persistent undo log is added
+to the history list. If it is not suitable for reuse, its slot is reset.
+In both cases, an x-latch must be acquired on the rollback segment header page.
+ The purge operation steps through the history list without modifying
+it until a truncate operation occurs, which can remove undo logs from the end
+of the list and release undo log segments. In stepping through the list,
+s-latches on the undo log pages are enough, but in a truncate, x-latches must
+be obtained on the rollback segment and individual pages. */
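+
+/* The latching rules above, condensed into a table (illustration only):
+
+    operation                               latches
+    --------------------------------------  -----------------------------------
+    assign undo log at first modification   rollback segment header (x)
+    write or roll back undo records         undo page latches (x), private to trx
+    commit: add undo log to history list    rollback segment header page (x)
+    purge: step through the history list    undo page latches (s) are enough
+    purge: truncate the history list        rollback segment and pages (x)
+*/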
+
+/********************************************************************//**
+Creates and initializes an undo log memory object.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open XA transaction identification*/
+ uint32_t page_no,/*!< in: undo log header page number */
+ uint16_t offset);/*!< in: undo log header byte offset on page */
+
+/** Determine the start offset of undo log records of an undo log page.
+@param[in] block undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset
+@return start offset */
+static
+uint16_t trx_undo_page_get_start(const buf_block_t *block, uint32_t page_no,
+ uint16_t offset)
+{
+ return page_no == block->page.id().page_no()
+ ? mach_read_from_2(offset + TRX_UNDO_LOG_START + block->page.frame)
+ : TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE;
+}
+
+/** Get the first undo log record on a page.
+@param[in] block undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header page offset
+@return pointer to first record
+@retval NULL if none exists */
+static trx_undo_rec_t*
+trx_undo_page_get_first_rec(const buf_block_t *block, uint32_t page_no,
+ uint16_t offset)
+{
+ uint16_t start= trx_undo_page_get_start(block, page_no, offset);
+ return start == trx_undo_page_get_end(block, page_no, offset)
+ ? nullptr : block->page.frame + start;
+}
+
+/** Get the last undo log record on a page.
+@param[in] page undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header page offset
+@return pointer to last record
+@retval NULL if none exists */
+static
+trx_undo_rec_t*
+trx_undo_page_get_last_rec(const buf_block_t *block, uint32_t page_no,
+ uint16_t offset)
+{
+ uint16_t end= trx_undo_page_get_end(block, page_no, offset);
+ return trx_undo_page_get_start(block, page_no, offset) == end
+ ? nullptr
+ : block->page.frame + mach_read_from_2(block->page.frame + end - 2);
+}
+
+/** Get the previous record in an undo log from the previous page.
+@param[in,out] block undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] shared latching mode: true=RW_S_LATCH, false=RW_X_LATCH
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+static trx_undo_rec_t*
+trx_undo_get_prev_rec_from_prev_page(buf_block_t *&block, uint16_t rec,
+ uint32_t page_no, uint16_t offset,
+ bool shared, mtr_t *mtr)
+{
+ uint32_t prev_page_no= mach_read_from_4(TRX_UNDO_PAGE_HDR +
+ TRX_UNDO_PAGE_NODE +
+ FLST_PREV + FIL_ADDR_PAGE +
+ block->page.frame);
+
+ if (prev_page_no == FIL_NULL)
+ return nullptr;
+
+ block= buf_page_get(page_id_t(block->page.id().space(), prev_page_no),
+ 0, shared ? RW_S_LATCH : RW_X_LATCH, mtr);
+
+ return block ? trx_undo_page_get_last_rec(block, page_no, offset) : nullptr;
+}
+
+/** Get the previous undo log record.
+@param[in] block undo log page
+@param[in] rec undo log record
+@param[in] page_no undo log header page number
+@param[in] offset undo log header page offset
+@return pointer to record
+@retval NULL if none */
+static
+trx_undo_rec_t*
+trx_undo_page_get_prev_rec(const buf_block_t *block, trx_undo_rec_t *rec,
+ uint32_t page_no, uint16_t offset)
+{
+ ut_ad(block->page.frame == page_align(rec));
+ return
+ rec == block->page.frame + trx_undo_page_get_start(block, page_no, offset)
+ ? nullptr
+ : block->page.frame + mach_read_from_2(rec - 2);
+}
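+
+/* How the accessors above navigate records within one undo page
+(illustrative summary derived from the code; read2 stands for
+mach_read_from_2): the page header field TRX_UNDO_PAGE_FREE points just past
+the last record, the two bytes at (end - 2) hold the start offset of the
+last record, and the two bytes immediately preceding any record hold the
+start offset of the record before it, so walking backwards is simply a
+chain of 2-byte offsets:
+
+    last = frame + read2(frame + end - 2);
+    prev = (rec == first) ? nullptr : frame + read2(rec - 2);
+*/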
+
+/** Get the previous record in an undo log.
+@param[in,out] block undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] shared latching mode: true=RW_S_LATCH, false=RW_X_LATCH
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+trx_undo_rec_t*
+trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
+ uint16_t offset, bool shared, mtr_t *mtr)
+{
+ if (trx_undo_rec_t *prev= trx_undo_page_get_prev_rec(block,
+ block->page.frame + rec,
+ page_no, offset))
+ return prev;
+
+ /* We have to go to the previous undo log page to look for the
+ previous record */
+
+ return trx_undo_get_prev_rec_from_prev_page(block, rec, page_no, offset,
+ shared, mtr);
+}
+
+/** Get the next record in an undo log from the next page.
+@param[in,out] block undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] mode latching mode: RW_S_LATCH or RW_X_LATCH
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+static trx_undo_rec_t*
+trx_undo_get_next_rec_from_next_page(const buf_block_t *&block,
+ uint32_t page_no, uint16_t offset,
+ ulint mode, mtr_t *mtr)
+{
+ if (page_no == block->page.id().page_no() &&
+ mach_read_from_2(block->page.frame + offset + TRX_UNDO_NEXT_LOG))
+ return nullptr;
+
+ uint32_t next= mach_read_from_4(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE +
+ FLST_NEXT + FIL_ADDR_PAGE +
+ block->page.frame);
+ if (next == FIL_NULL)
+ return nullptr;
+
+ block= buf_page_get_gen(page_id_t(block->page.id().space(), next), 0, mode,
+ nullptr, BUF_GET_POSSIBLY_FREED, mtr);
+
+ return block ? trx_undo_page_get_first_rec(block, page_no, offset) : nullptr;
+}
+
+/** Get the next record in an undo log.
+@param[in,out] block undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+trx_undo_rec_t*
+trx_undo_get_next_rec(const buf_block_t *&block, uint16_t rec,
+ uint32_t page_no, uint16_t offset, mtr_t *mtr)
+{
+ if (trx_undo_rec_t *next= trx_undo_page_get_next_rec(block, rec, page_no,
+ offset))
+ return next;
+
+ return trx_undo_get_next_rec_from_next_page(block, page_no, offset,
+ RW_S_LATCH, mtr);
+}
+
+/** Get the first record in an undo log.
+@param[in] space undo log header space
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] mode latching mode: RW_S_LATCH or RW_X_LATCH
+@param[out] block undo log page
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@return undo log record, the page latched
+@retval nullptr if none */
+trx_undo_rec_t*
+trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no,
+ uint16_t offset, ulint mode, const buf_block_t*& block,
+ mtr_t *mtr, dberr_t *err)
+{
+ block= buf_page_get_gen(page_id_t{space.id, page_no}, 0, mode,
+ nullptr, BUF_GET, mtr, err);
+ if (!block)
+ return nullptr;
+
+ if (trx_undo_rec_t *rec= trx_undo_page_get_first_rec(block, page_no, offset))
+ return rec;
+
+ return trx_undo_get_next_rec_from_next_page(block, page_no, offset, mode,
+ mtr);
+}
+
+inline void UndorecApplier::assign_rec(const buf_block_t &block,
+ uint16_t offset)
+{
+ ut_ad(block.page.lock.have_s());
+ this->offset= offset;
+ this->undo_rec= trx_undo_rec_copy(block.page.frame + offset, heap);
+}
+
+inline void UndorecApplier::apply_undo_rec()
+{
+ if (!undo_rec)
+ return;
+ bool updated_extern= false;
+ undo_no_t undo_no= 0;
+ table_id_t table_id= 0;
+ undo_rec= trx_undo_rec_get_pars(undo_rec, &type,
+ &cmpl_info,
+ &updated_extern, &undo_no, &table_id);
+ dict_sys.freeze(SRW_LOCK_CALL);
+ dict_table_t *table= dict_sys.find_table(table_id);
+ dict_sys.unfreeze();
+
+ ut_ad(table);
+ if (!table->is_active_ddl())
+ return;
+
+ dict_index_t *index= dict_table_get_first_index(table);
+ const dtuple_t *undo_tuple;
+ switch (type) {
+ default:
+ ut_ad("invalid type" == 0);
+ MY_ASSERT_UNREACHABLE();
+ case TRX_UNDO_INSERT_REC:
+ undo_rec= trx_undo_rec_get_row_ref(undo_rec, index, &undo_tuple, heap);
+ insert:
+ log_insert(*undo_tuple, index);
+ break;
+ case TRX_UNDO_UPD_EXIST_REC:
+ case TRX_UNDO_UPD_DEL_REC:
+ case TRX_UNDO_DEL_MARK_REC:
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ byte info_bits;
+ undo_rec= trx_undo_update_rec_get_sys_cols(
+ undo_rec, &trx_id, &roll_ptr, &info_bits);
+
+ undo_rec= trx_undo_rec_get_row_ref(undo_rec, index, &undo_tuple, heap);
+ undo_rec= trx_undo_update_rec_get_update(undo_rec, index, type, trx_id,
+ roll_ptr, info_bits,
+ heap, &update);
+ if (type == TRX_UNDO_UPD_DEL_REC)
+ goto insert;
+ log_update(*undo_tuple, index);
+ }
+
+ clear_undo_rec();
+}
+
+/** Apply any changes to tables for which online DDL is in progress. */
+ATTRIBUTE_COLD void trx_t::apply_log()
+{
+ const trx_undo_t *undo= rsegs.m_redo.undo;
+ if (!undo || !undo_no)
+ return;
+ page_id_t page_id{rsegs.m_redo.rseg->space->id, undo->hdr_page_no};
+ page_id_t next_page_id(page_id);
+ mtr_t mtr;
+ mtr.start();
+ buf_block_t *block= buf_page_get(page_id, 0, RW_S_LATCH, &mtr);
+ if (UNIV_UNLIKELY(!block))
+ {
+ mtr.commit();
+ return;
+ }
+
+ UndorecApplier log_applier(page_id, id);
+
+ for (;;)
+ {
+ trx_undo_rec_t *rec= trx_undo_page_get_first_rec(block, page_id.page_no(),
+ undo->hdr_offset);
+ while (rec)
+ {
+ log_applier.assign_rec(*block, page_offset(rec));
+ mtr.commit();
+ log_applier.apply_undo_rec();
+ mtr.start();
+ block= buf_page_get(log_applier.get_page_id(), 0, RW_S_LATCH, &mtr);
+ if (UNIV_UNLIKELY(!block))
+ goto func_exit;
+ rec= trx_undo_page_get_next_rec(block, log_applier.get_offset(),
+ page_id.page_no(), undo->hdr_offset);
+ }
+
+ uint32_t next= mach_read_from_4(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE +
+ FLST_NEXT + FIL_ADDR_PAGE +
+ block->page.frame);
+ if (next == FIL_NULL)
+ break;
+ next_page_id.set_page_no(next);
+ mtr.commit();
+ mtr.start();
+ block= buf_page_get_gen(next_page_id, 0, RW_S_LATCH, block, BUF_GET, &mtr);
+ if (UNIV_UNLIKELY(!block))
+ break;
+ log_applier.assign_next(next_page_id);
+ }
+func_exit:
+ mtr.commit();
+ apply_online_log= false;
+}
+
+/*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/
+
+/** Initialize an undo log page.
+NOTE: This corresponds to a redo log record and must not be changed!
+@see mtr_t::undo_create()
+@param block undo log page */
+void trx_undo_page_init(const buf_block_t &block)
+{
+ mach_write_to_2(my_assume_aligned<2>(FIL_PAGE_TYPE + block.page.frame),
+ FIL_PAGE_UNDO_LOG);
+ static_assert(TRX_UNDO_PAGE_HDR == FIL_PAGE_DATA, "compatibility");
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + block.page.frame,
+ 0, 2);
+ mach_write_to_2(my_assume_aligned<2>
+ (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.page.frame),
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ memcpy_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block.page.frame,
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.page.frame,
+ 2);
+ /* The following corresponds to flst_zero_both(), but without writing log. */
+ memset_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV +
+ FIL_ADDR_PAGE + block.page.frame, 0xff, 4);
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV +
+ FIL_ADDR_BYTE + block.page.frame, 0, 2);
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_NEXT +
+ FIL_ADDR_PAGE + block.page.frame, 0xff, 4);
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_NEXT +
+ FIL_ADDR_BYTE + block.page.frame, 0, 2);
+ static_assert(TRX_UNDO_PAGE_NODE + FLST_NEXT + FIL_ADDR_BYTE + 2 ==
+ TRX_UNDO_PAGE_HDR_SIZE, "compatibility");
+ /* Preserve TRX_UNDO_SEG_HDR, but clear the rest of the page. */
+ memset_aligned<2>(TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE +
+ block.page.frame, 0,
+ srv_page_size - (TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE +
+ FIL_PAGE_DATA_END));
+}
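+
+/* The page image that trx_undo_page_init() produces, summarised for
+illustration only:
+
+    FIL_PAGE_TYPE        = FIL_PAGE_UNDO_LOG
+    TRX_UNDO_PAGE_TYPE   = 0
+    TRX_UNDO_PAGE_START  = TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE
+    TRX_UNDO_PAGE_FREE   = same value as TRX_UNDO_PAGE_START (page is empty)
+    TRX_UNDO_PAGE_NODE   = detached list node (prev/next page = FIL_NULL)
+    the area after TRX_UNDO_SEG_HDR, up to the page trailer, is zeroed */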
+
+/** Look for a free slot for an undo log segment.
+@param rseg_header rollback segment header
+@return slot index
+@retval ULINT_UNDEFINED if not found */
+static ulint trx_rsegf_undo_find_free(const buf_block_t *rseg_header)
+{
+ ulint max_slots= TRX_RSEG_N_SLOTS;
+
+#ifdef UNIV_DEBUG
+ if (trx_rseg_n_slots_debug)
+ max_slots= std::min<ulint>(trx_rseg_n_slots_debug, TRX_RSEG_N_SLOTS);
+#endif
+
+ for (ulint i= 0; i < max_slots; i++)
+ if (trx_rsegf_get_nth_undo(rseg_header, i) == FIL_NULL)
+ return i;
+
+ return ULINT_UNDEFINED;
+}
+
+/** Create an undo log segment.
+@param[in,out] space tablespace
+@param[in,out] rseg_hdr rollback segment header (x-latched)
+@param[out] id undo slot number
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return undo log block
+@retval NULL on failure */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+buf_block_t*
+trx_undo_seg_create(fil_space_t *space, buf_block_t *rseg_hdr, ulint *id,
+ dberr_t *err, mtr_t *mtr)
+{
+ buf_block_t* block;
+ uint32_t n_reserved;
+
+ const ulint slot_no = trx_rsegf_undo_find_free(rseg_hdr);
+
+ if (slot_no == ULINT_UNDEFINED) {
+ ib::warn() << "Cannot find a free slot for an undo log. Do"
+ " you have too many active transactions running"
+ " concurrently?";
+
+ *err = DB_TOO_MANY_CONCURRENT_TRXS;
+ return NULL;
+ }
+
+ ut_ad(slot_no < TRX_RSEG_N_SLOTS);
+
+ *err = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO,
+ mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return NULL;
+ }
+
+ /* Allocate a new file segment for the undo log */
+ block = fseg_create(space, TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER,
+ mtr, err, true);
+
+ space->release_free_extents(n_reserved);
+
+ if (!block) {
+ return block;
+ }
+
+ mtr->undo_create(*block);
+ trx_undo_page_init(*block);
+
+ mtr->write<2>(*block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + block->page.frame,
+ TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*block,
+ TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+ + block->page.frame, 0U);
+
+ flst_init(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST
+ + block->page.frame, mtr);
+
+ *err = flst_add_last(block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE,
+ mtr);
+
+ *id = slot_no;
+ mtr->write<4>(*rseg_hdr, TRX_RSEG + TRX_RSEG_UNDO_SLOTS
+ + slot_no * TRX_RSEG_SLOT_SIZE + rseg_hdr->page.frame,
+ block->page.id().page_no());
+
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED);
+
+ *err = DB_SUCCESS;
+ return block;
+}
+
+/** Initialize an undo log header.
+@param[in,out] undo_page undo log segment header page
+@param[in] trx_id transaction identifier
+@param[in,out] mtr mini-transaction
+@return header byte offset on page */
+static uint16_t trx_undo_header_create(buf_block_t *undo_page, trx_id_t trx_id,
+ mtr_t* mtr)
+{
+ /* Reset the TRX_UNDO_PAGE_TYPE in case this page is being
+ repurposed after upgrading to MariaDB 10.3. */
+ byte *undo_type= my_assume_aligned<2>
+ (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + undo_page->page.frame);
+ ut_ad(mach_read_from_2(undo_type) <= 2);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_type, 0U);
+ byte *start= my_assume_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START +
+ undo_page->page.frame);
+ const uint16_t free= mach_read_from_2(start + 2);
+ static_assert(TRX_UNDO_PAGE_START + 2 == TRX_UNDO_PAGE_FREE,
+ "compatibility");
+ ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < srv_page_size - 100);
+
+ mach_write_to_2(start, free + TRX_UNDO_LOG_XA_HDR_SIZE);
+ /* A WRITE of 2 bytes is never longer than a MEMMOVE.
+ So, WRITE 2+2 bytes is better than WRITE+MEMMOVE.
+ But, a MEMSET will only be 1+2 bytes, that is, 1 byte shorter! */
+ memcpy_aligned<2>(start + 2, start, 2);
+ mtr->memset(*undo_page, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START, 4,
+ start, 2);
+ uint16_t prev_log= mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG +
+ undo_page->page.frame);
+ alignas(4) byte buf[4];
+ mach_write_to_2(buf, TRX_UNDO_ACTIVE);
+ mach_write_to_2(buf + 2, free);
+ static_assert(TRX_UNDO_STATE + 2 == TRX_UNDO_LAST_LOG, "compatibility");
+ static_assert(!((TRX_UNDO_SEG_HDR + TRX_UNDO_STATE) % 4), "alignment");
+ mtr->memcpy(*undo_page, my_assume_aligned<4>
+ (TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + undo_page->page.frame),
+ buf, 4);
+ if (prev_log)
+ mtr->write<2>(*undo_page, prev_log + TRX_UNDO_NEXT_LOG +
+ undo_page->page.frame, free);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_TRX_ID +
+ undo_page->page.frame, trx_id);
+ if (UNIV_UNLIKELY(mach_read_from_8(free + TRX_UNDO_TRX_NO +
+ undo_page->page.frame) != 0))
+ mtr->memset(undo_page, free + TRX_UNDO_TRX_NO, 8, 0);
+
+ /* Write TRX_UNDO_NEEDS_PURGE=1 and TRX_UNDO_LOG_START. */
+ mach_write_to_2(buf, 1);
+ memcpy_aligned<2>(buf + 2, start, 2);
+ static_assert(TRX_UNDO_NEEDS_PURGE + 2 == TRX_UNDO_LOG_START,
+ "compatibility");
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_NEEDS_PURGE +
+ undo_page->page.frame, buf, 4);
+ /* Initialize all fields from TRX_UNDO_XID_EXISTS to TRX_UNDO_HISTORY_NODE. */
+ if (prev_log)
+ {
+ mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS,
+ TRX_UNDO_PREV_LOG - TRX_UNDO_XID_EXISTS, 0);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_PREV_LOG +
+ undo_page->page.frame, prev_log);
+ static_assert(TRX_UNDO_PREV_LOG + 2 == TRX_UNDO_HISTORY_NODE,
+ "compatibility");
+ mtr->memset(undo_page, free + TRX_UNDO_HISTORY_NODE, FLST_NODE_SIZE, 0);
+ static_assert(TRX_UNDO_LOG_OLD_HDR_SIZE == TRX_UNDO_HISTORY_NODE +
+ FLST_NODE_SIZE, "compatibility");
+ }
+ else
+ mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS,
+ TRX_UNDO_LOG_OLD_HDR_SIZE - TRX_UNDO_XID_EXISTS, 0);
+ return free;
+}
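+
+/* A rough map of the undo log header that trx_undo_header_create() and
+trx_undo_write_xid() fill in at byte offset "free", shown for orientation
+only (the authoritative offsets are the constants in trx0undo.h):
+
+	TRX_UNDO_TRX_ID        transaction identifier
+	TRX_UNDO_TRX_NO        transaction serialization number; cleared here,
+	                       filled in when the transaction commits
+	TRX_UNDO_NEEDS_PURGE   written as 1 above
+	TRX_UNDO_LOG_START     offset of the first undo record of this log
+	TRX_UNDO_XID_EXISTS    nonzero if an XID follows (set at XA PREPARE,
+	                       see trx_undo_set_state_at_prepare())
+	TRX_UNDO_DICT_TRANS,
+	TRX_UNDO_TABLE_ID      dictionary-transaction flag and table id
+	                       (see trx_undo_create())
+	TRX_UNDO_NEXT_LOG      offset of the next undo log header on this
+	                       page, or 0 if this is the last one
+	TRX_UNDO_PREV_LOG      offset of the previous undo log header, or 0
+	TRX_UNDO_HISTORY_NODE  node in the rollback segment history list
+	TRX_UNDO_XA_FORMAT, TRX_UNDO_XA_TRID_LEN, TRX_UNDO_XA_BQUAL_LEN,
+	TRX_UNDO_XA_XID        X/Open XA identifier (trx_undo_write_xid())
+*/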
+
+/** Write X/Open XA Transaction Identifier (XID) to undo log header
+@param[in,out] block undo header page
+@param[in] offset undo header record offset
+@param[in] xid distributed transaction identifier
+@param[in,out] mtr mini-transaction */
+static void trx_undo_write_xid(buf_block_t *block, uint16_t offset,
+ const XID &xid, mtr_t *mtr)
+{
+ DBUG_ASSERT(xid.gtrid_length > 0);
+ DBUG_ASSERT(xid.bqual_length >= 0);
+ DBUG_ASSERT(xid.gtrid_length <= MAXGTRIDSIZE);
+ DBUG_ASSERT(xid.bqual_length <= MAXBQUALSIZE);
+ static_assert(MAXGTRIDSIZE + MAXBQUALSIZE == XIDDATASIZE,
+ "gtrid and bqual don't fit xid data");
+ DBUG_ASSERT(mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG +
+ block->page.frame) == offset);
+
+ trx_ulogf_t* log_hdr= block->page.frame + offset;
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_FORMAT,
+ static_cast<uint32_t>(xid.formatID));
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_TRID_LEN,
+ static_cast<uint32_t>(xid.gtrid_length));
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_BQUAL_LEN,
+ static_cast<uint32_t>(xid.bqual_length));
+ const ulint xid_length= static_cast<ulint>(xid.gtrid_length
+ + xid.bqual_length);
+ mtr->memcpy(*block, &block->page.frame[offset + TRX_UNDO_XA_XID],
+ xid.data, xid_length);
+ if (UNIV_LIKELY(xid_length < XIDDATASIZE))
+ mtr->memset(block, offset + TRX_UNDO_XA_XID + xid_length,
+ XIDDATASIZE - xid_length, 0);
+}
+
+/********************************************************************//**
+Read X/Open XA Transaction Identification (XID) from undo log header */
+static
+void
+trx_undo_read_xid(const trx_ulogf_t* log_hdr, XID* xid)
+{
+ xid->formatID=static_cast<long>(mach_read_from_4(
+ log_hdr + TRX_UNDO_XA_FORMAT));
+
+ xid->gtrid_length=static_cast<long>(mach_read_from_4(
+ log_hdr + TRX_UNDO_XA_TRID_LEN));
+
+ xid->bqual_length=static_cast<long>(mach_read_from_4(
+ log_hdr + TRX_UNDO_XA_BQUAL_LEN));
+
+ memcpy(xid->data, log_hdr + TRX_UNDO_XA_XID, XIDDATASIZE);
+}
+
+/** Allocate an undo log page.
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction that does not hold any page latch
+@param[out] err error code
+@return X-latched block if success
+@retval nullptr on failure */
+buf_block_t *trx_undo_add_page(trx_undo_t *undo, mtr_t *mtr, dberr_t *err)
+{
+ buf_block_t *new_block= nullptr;
+ uint32_t n_reserved;
+
+ /* When we add a page to an undo log, this is analogous to
+ a pessimistic insert in a B-tree, and we must reserve the
+ counterpart of the tree latch, which is the rseg mutex. */
+
+ trx_rseg_t *rseg= undo->rseg;
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
+
+ buf_block_t *header_block=
+ buf_page_get_gen(page_id_t{rseg->space->id, undo->hdr_page_no},
+ 0, RW_X_LATCH, nullptr, BUF_GET, mtr, err);
+ if (!header_block)
+ goto func_exit;
+ *err= fsp_reserve_free_extents(&n_reserved, rseg->space, 1, FSP_UNDO, mtr);
+
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+ goto func_exit;
+
+ new_block=
+ fseg_alloc_free_page_general(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER +
+ header_block->page.frame,
+ undo->top_page_no + 1, FSP_UP, true,
+ mtr, mtr, err);
+ rseg->space->release_free_extents(n_reserved);
+
+ if (!new_block)
+ goto func_exit;
+
+ undo->last_page_no= new_block->page.id().page_no();
+
+ mtr->undo_create(*new_block);
+ trx_undo_page_init(*new_block);
+ *err= flst_add_last(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ new_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+ new_block= nullptr;
+ else
+ {
+ undo->size++;
+ rseg->curr_size++;
+ }
+
+func_exit:
+ rseg->latch.wr_unlock();
+ return new_block;
+}
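+
+/* A minimal usage sketch (hypothetical caller; the real call sites are the
+undo record writers in trx0rec.cc): when an undo record no longer fits on
+undo->last_page_no, the writer starts a mini-transaction that holds no undo
+page latch and extends the log:
+
+	mtr_t mtr;
+	dberr_t err;
+	mtr.start();
+	buf_block_t *block= trx_undo_add_page(undo, &mtr, &err);
+	if (block)
+	  ... append the record to the new, X-latched last page ...
+	mtr.commit();
+*/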
+
+/********************************************************************//**
+Frees an undo log page that is not the header page.
+@return last page number in remaining log */
+static
+uint32_t
+trx_undo_free_page(
+/*===============*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ bool in_history, /*!< in: TRUE if the undo log is in the history
+ list */
+ uint32_t hdr_page_no, /*!< in: header page number */
+ uint32_t page_no, /*!< in: page number to free: must not be the
+ header page */
+ mtr_t* mtr, /*!< in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+ dberr_t* err) /*!< out: error code */
+{
+ ut_a(hdr_page_no != page_no);
+
+ buf_block_t* undo_block = buf_page_get_gen(page_id_t(rseg->space->id,
+ page_no),
+ 0, RW_X_LATCH, nullptr,
+ BUF_GET, mtr, err);
+ if (UNIV_UNLIKELY(!undo_block)) {
+ return FIL_NULL;
+ }
+ buf_block_t* header_block = buf_page_get_gen(page_id_t(rseg->space->id,
+ hdr_page_no),
+ 0, RW_X_LATCH, nullptr,
+ BUF_GET, mtr, err);
+ if (UNIV_UNLIKELY(!header_block)) {
+ return FIL_NULL;
+ }
+
+ *err = flst_remove(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE,
+ mtr);
+
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return FIL_NULL;
+ }
+
+ *err = fseg_free_page(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ + header_block->page.frame,
+ rseg->space, page_no, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return FIL_NULL;
+ }
+ buf_page_free(rseg->space, page_no, mtr);
+
+ const fil_addr_t last_addr = flst_get_last(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST
+ + header_block->page.frame);
+ rseg->curr_size--;
+
+ if (!in_history) {
+ } else if (buf_block_t* rseg_header = rseg->get(mtr, err)) {
+ byte* rseg_hist_size = TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+ + rseg_header->page.frame;
+ uint32_t hist_size = mach_read_from_4(rseg_hist_size);
+ ut_ad(hist_size > 0);
+ mtr->write<4>(*rseg_header, rseg_hist_size, hist_size - 1);
+ } else {
+ return FIL_NULL;
+ }
+
+ return(last_addr.page);
+}
+
+/** Free the last undo log page. The caller must hold the rseg mutex.
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction that does not hold any undo log page
+ or that has allocated the undo log page
+@return error code */
+dberr_t trx_undo_free_last_page(trx_undo_t *undo, mtr_t *mtr)
+{
+ ut_ad(undo->hdr_page_no != undo->last_page_no);
+ ut_ad(undo->size > 0);
+ undo->size--;
+
+ dberr_t err;
+ undo->last_page_no= trx_undo_free_page(undo->rseg, false, undo->hdr_page_no,
+ undo->last_page_no, mtr, &err);
+ return err;
+}
+
+/** Truncate the tail of an undo log during rollback.
+@param[in,out] undo undo log
+@param[in] limit all undo records with undo number >= limit will be discarded
+@param[in] is_temp whether this is temporary undo log
+@return error code */
+static dberr_t trx_undo_truncate_end(trx_undo_t &undo, undo_no_t limit,
+ bool is_temp)
+{
+ ut_ad(is_temp == !undo.rseg->is_persistent());
+
+ for (mtr_t mtr;;)
+ {
+ mtr.start();
+ if (is_temp)
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ trx_undo_rec_t *trunc_here= nullptr;
+ undo.rseg->latch.wr_lock(SRW_LOCK_CALL);
+ dberr_t err;
+ buf_block_t *undo_block=
+ buf_page_get_gen(page_id_t{undo.rseg->space->id, undo.last_page_no},
+ 0, RW_X_LATCH, nullptr, BUF_GET, &mtr, &err);
+ if (UNIV_UNLIKELY(!undo_block))
+ goto func_exit;
+
+ for (trx_undo_rec_t *rec=
+ trx_undo_page_get_last_rec(undo_block,
+ undo.hdr_page_no, undo.hdr_offset);
+ rec; )
+ {
+ if (trx_undo_rec_get_undo_no(rec) < limit)
+ goto func_exit;
+ /* Truncate at least this record off, maybe more */
+ trunc_here= rec;
+ rec= trx_undo_page_get_prev_rec(undo_block, rec,
+ undo.hdr_page_no, undo.hdr_offset);
+ }
+
+ if (undo.last_page_no != undo.hdr_page_no)
+ {
+ err= trx_undo_free_last_page(&undo, &mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ goto func_exit;
+ undo.rseg->latch.wr_unlock();
+ mtr.commit();
+ continue;
+ }
+
+func_exit:
+ undo.rseg->latch.wr_unlock();
+
+ if (trunc_here && err == DB_SUCCESS)
+ mtr.write<2>(*undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE +
+ undo_block->page.frame,
+ ulint(trunc_here - undo_block->page.frame));
+
+ mtr.commit();
+ return err;
+ }
+}
+
+/** Try to truncate the undo logs.
+@param trx transaction
+@return error code */
+dberr_t trx_undo_try_truncate(const trx_t &trx)
+{
+ if (trx_undo_t *undo= trx.rsegs.m_redo.undo)
+ {
+ ut_ad(undo->rseg == trx.rsegs.m_redo.rseg);
+ if (dberr_t err= trx_undo_truncate_end(*undo, trx.undo_no, false))
+ return err;
+ }
+
+ if (trx_undo_t *undo = trx.rsegs.m_noredo.undo)
+ {
+ ut_ad(undo->rseg == trx.rsegs.m_noredo.rseg);
+ if (dberr_t err= trx_undo_truncate_end(*undo, trx.undo_no, true))
+ return err;
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Truncate the head of an undo log.
+NOTE that only whole pages are freed; the header page is not
+freed, but emptied, if all the records there are below the limit.
+@param[in,out] rseg rollback segment
+@param[in] hdr_page_no header page number
+@param[in] hdr_offset header offset on the page
+@param[in] limit first undo number to preserve
+(everything below the limit will be truncated)
+@return error code */
+dberr_t
+trx_undo_truncate_start(
+ trx_rseg_t* rseg,
+ uint32_t hdr_page_no,
+ uint16_t hdr_offset,
+ undo_no_t limit)
+{
+ trx_undo_rec_t* rec;
+ trx_undo_rec_t* last_rec;
+ mtr_t mtr;
+
+ if (!limit) {
+ return DB_SUCCESS;
+ }
+loop:
+ mtr_start(&mtr);
+
+ if (!rseg->is_persistent()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ dberr_t err;
+ const buf_block_t* undo_page;
+ rec = trx_undo_get_first_rec(*rseg->space, hdr_page_no, hdr_offset,
+ RW_X_LATCH, undo_page, &mtr, &err);
+ if (rec == NULL) {
+ /* Already empty */
+done:
+ mtr.commit();
+ return err;
+ }
+
+ last_rec = trx_undo_page_get_last_rec(undo_page, hdr_page_no,
+ hdr_offset);
+ if (trx_undo_rec_get_undo_no(last_rec) >= limit) {
+ goto done;
+ }
+
+ if (undo_page->page.id().page_no() == hdr_page_no) {
+ uint16_t end = mach_read_from_2(hdr_offset + TRX_UNDO_NEXT_LOG
+ + undo_page->page.frame);
+ if (end == 0) {
+ end = mach_read_from_2(TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + undo_page->page.frame);
+ }
+
+ mtr.write<2>(*undo_page, undo_page->page.frame + hdr_offset
+ + TRX_UNDO_LOG_START, end);
+ } else {
+ trx_undo_free_page(rseg, true, hdr_page_no,
+ undo_page->page.id().page_no(), &mtr, &err);
+ if (err != DB_SUCCESS) {
+ goto done;
+ }
+ }
+
+ mtr.commit();
+ goto loop;
+}
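+
+/* Note on the two truncation paths: trx_undo_truncate_end() trims the tail of
+an undo log (newest records) during rollback, via trx_undo_try_truncate(),
+while trx_undo_truncate_start() trims the head (oldest records) and is driven
+by the purge subsystem in trx0purge.cc. */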
+
+/** Frees an undo log segment which is not in the history list.
+@param undo temporary undo log */
+static void trx_undo_seg_free(const trx_undo_t *undo)
+{
+ ut_ad(undo->id < TRX_RSEG_N_SLOTS);
+
+ trx_rseg_t *const rseg= undo->rseg;
+ bool finished;
+ mtr_t mtr;
+ ut_ad(rseg->space == fil_system.temp_space);
+
+ do
+ {
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ finished= true;
+
+ if (buf_block_t *block=
+ buf_page_get(page_id_t(SRV_TMP_SPACE_ID, undo->hdr_page_no), 0,
+ RW_X_LATCH, &mtr))
+ {
+ fseg_header_t *file_seg= TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER +
+ block->page.frame;
+
+ finished= fseg_free_step(file_seg, &mtr);
+
+ if (!finished);
+ else if (buf_block_t* rseg_header = rseg->get(&mtr, nullptr))
+ {
+ static_assert(FIL_NULL == 0xffffffff, "compatibility");
+ mtr.memset(rseg_header, TRX_RSEG + TRX_RSEG_UNDO_SLOTS +
+ undo->id * TRX_RSEG_SLOT_SIZE, 4, 0xff);
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
+ }
+ }
+
+ mtr.commit();
+ }
+ while (!finished);
+}
+
+/*========== UNDO LOG MEMORY COPY INITIALIZATION =====================*/
+
+/** Read an undo log when starting up the database.
+@param[in,out] rseg rollback segment
+@param[in] id rollback segment slot
+@param[in] page_no undo log segment page number
+@param[in,out] max_trx_id the largest observed transaction ID
+@return the undo log
+@retval nullptr on error */
+trx_undo_t *
+trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no,
+ trx_id_t &max_trx_id)
+{
+ mtr_t mtr;
+ XID xid;
+
+ ut_ad(id < TRX_RSEG_N_SLOTS);
+
+ mtr.start();
+ const buf_block_t* block = buf_page_get(
+ page_id_t(rseg->space->id, page_no), 0, RW_X_LATCH, &mtr);
+ if (UNIV_UNLIKELY(!block)) {
+corrupted:
+ mtr.commit();
+ return nullptr;
+ }
+
+ const uint16_t type = mach_read_from_2(TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE
+ + block->page.frame);
+ if (UNIV_UNLIKELY(type > 2)) {
+corrupted_type:
+ sql_print_error("InnoDB: unsupported undo header type %u",
+ type);
+ goto corrupted;
+ }
+
+ uint16_t offset = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+ + block->page.frame);
+ if (offset < TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE ||
+ offset >= srv_page_size - TRX_UNDO_LOG_OLD_HDR_SIZE) {
+ sql_print_error("InnoDB: invalid undo header offset %u",
+ offset);
+ goto corrupted;
+ }
+
+ const trx_ulogf_t* const undo_header = block->page.frame + offset;
+ uint16_t state = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+ + block->page.frame);
+ switch (state) {
+ case TRX_UNDO_ACTIVE:
+ case TRX_UNDO_PREPARED:
+ if (UNIV_LIKELY(type != 1)) {
+ break;
+ }
+ sql_print_error("InnoDB: upgrade from older version than"
+ " MariaDB 10.3 requires clean shutdown");
+ goto corrupted;
+ default:
+ sql_print_error("InnoDB: unsupported undo header state %u",
+ state);
+ goto corrupted;
+ case TRX_UNDO_CACHED:
+ if (UNIV_UNLIKELY(type != 0)) {
+ /* This undo page was not updated by MariaDB
+ 10.3 or later. The TRX_UNDO_TRX_NO field may
+ contain garbage. */
+ break;
+ }
+ goto read_trx_no;
+ case TRX_UNDO_TO_PURGE:
+ if (UNIV_UNLIKELY(type == 1)) {
+ goto corrupted_type;
+ }
+ read_trx_no:
+ trx_id_t id = mach_read_from_8(TRX_UNDO_TRX_NO + undo_header);
+ if (id >> 48) {
+ sql_print_error("InnoDB: corrupted TRX_NO %llx", id);
+ goto corrupted;
+ }
+ if (id > max_trx_id) {
+ max_trx_id = id;
+ }
+ }
+
+ /* Read X/Open XA transaction identification if it exists, or
+ set it to NULL. */
+
+ if (undo_header[TRX_UNDO_XID_EXISTS]) {
+ trx_undo_read_xid(undo_header, &xid);
+ } else {
+ xid.null();
+ }
+
+ trx_id_t trx_id = mach_read_from_8(undo_header + TRX_UNDO_TRX_ID);
+ if (trx_id >> 48) {
+ sql_print_error("InnoDB: corrupted TRX_ID %llx", trx_id);
+ goto corrupted;
+ }
+ if (trx_id > max_trx_id) {
+ max_trx_id = trx_id;
+ }
+
+ trx_undo_t* undo = trx_undo_mem_create(
+ rseg, id, trx_id, &xid, page_no, offset);
+ if (!undo) {
+ return undo;
+ }
+
+ undo->dict_operation = undo_header[TRX_UNDO_DICT_TRANS];
+ undo->size = flst_get_len(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST
+ + block->page.frame);
+
+ fil_addr_t last_addr = flst_get_last(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->page.frame);
+
+ undo->last_page_no = last_addr.page;
+ undo->top_page_no = last_addr.page;
+
+ const buf_block_t* last = buf_page_get(
+ page_id_t(rseg->space->id, undo->last_page_no), 0,
+ RW_X_LATCH, &mtr);
+
+ if (UNIV_UNLIKELY(!last)) {
+ ut_free(undo);
+ goto corrupted;
+ }
+
+ if (const trx_undo_rec_t* rec = trx_undo_page_get_last_rec(
+ last, page_no, offset)) {
+ undo->top_offset = static_cast<uint16_t>(
+ rec - last->page.frame);
+ undo->top_undo_no = trx_undo_rec_get_undo_no(rec);
+ ut_ad(!undo->empty());
+ } else {
+ undo->top_undo_no = IB_ID_MAX;
+ ut_ad(undo->empty());
+ }
+
+ undo->state = state;
+
+ if (state != TRX_UNDO_CACHED) {
+ UT_LIST_ADD_LAST(rseg->undo_list, undo);
+ } else {
+ UT_LIST_ADD_LAST(rseg->undo_cached, undo);
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ }
+
+ mtr.commit();
+ return undo;
+}
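+
+/* A sketch of how this function is driven at startup, assuming the slot scan
+performed by the rollback segment recovery code in trx0rseg.cc (abbreviated;
+not a verbatim copy of that code):
+
+	for (ulint i= 0; i < TRX_RSEG_N_SLOTS; i++)
+	{
+	  uint32_t page_no= trx_rsegf_get_nth_undo(rseg_hdr, i);
+	  if (page_no != FIL_NULL &&
+	      !trx_undo_mem_create_at_db_start(rseg, i, page_no, max_trx_id))
+	    return DB_CORRUPTION;
+	}
+*/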
+
+/********************************************************************//**
+Creates and initializes an undo log memory object.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open transaction identification */
+ uint32_t page_no,/*!< in: undo log header page number */
+ uint16_t offset) /*!< in: undo log header byte offset on page */
+{
+ trx_undo_t* undo;
+
+ ut_a(id < TRX_RSEG_N_SLOTS);
+
+ undo = static_cast<trx_undo_t*>(ut_malloc_nokey(sizeof(*undo)));
+
+ if (undo == NULL) {
+
+ return(NULL);
+ }
+
+ undo->id = id;
+ undo->state = TRX_UNDO_ACTIVE;
+ undo->trx_id = trx_id;
+ undo->xid = *xid;
+
+ undo->dict_operation = FALSE;
+
+ undo->rseg = rseg;
+
+ undo->hdr_page_no = page_no;
+ undo->hdr_offset = offset;
+ undo->last_page_no = page_no;
+ undo->size = 1;
+
+ undo->top_undo_no = IB_ID_MAX;
+ undo->top_page_no = page_no;
+ undo->guess_block = NULL;
+ ut_ad(undo->empty());
+
+ return(undo);
+}
+
+/********************************************************************//**
+Initializes a cached undo log object for new use. */
+static
+void
+trx_undo_mem_init_for_reuse(
+/*========================*/
+ trx_undo_t* undo, /*!< in: undo log to init */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open XA transaction identification*/
+ uint16_t offset) /*!< in: undo log header byte offset on page */
+{
+ ut_a(undo->id < TRX_RSEG_N_SLOTS);
+
+ undo->state = TRX_UNDO_ACTIVE;
+ undo->trx_id = trx_id;
+ undo->xid = *xid;
+
+ undo->dict_operation = FALSE;
+
+ undo->hdr_offset = offset;
+ undo->top_undo_no = IB_ID_MAX;
+ ut_ad(undo->empty());
+}
+
+/** Create an undo log.
+@param[in,out] trx transaction
+@param[in,out] rseg rollback segment
+@param[out] undo undo log object
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return undo log block
+@retval NULL on failure */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+buf_block_t*
+trx_undo_create(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
+ dberr_t* err, mtr_t* mtr)
+{
+ ulint id;
+ buf_block_t* block = rseg->get(mtr, err);
+
+ if (block) {
+ block = trx_undo_seg_create(rseg->space, block, &id, err, mtr);
+ }
+
+ if (!block) {
+ return NULL;
+ }
+
+ rseg->curr_size++;
+
+ uint16_t offset = trx_undo_header_create(block, trx->id, mtr);
+
+ *undo = trx_undo_mem_create(rseg, id, trx->id, &trx->xid,
+ block->page.id().page_no(), offset);
+ if (*undo == NULL) {
+ *err = DB_OUT_OF_MEMORY;
+ /* FIXME: this will not free the undo block to the file */
+ return NULL;
+ } else if (rseg != trx->rsegs.m_redo.rseg) {
+ return block;
+ }
+
+ if (trx->dict_operation) {
+ (*undo)->dict_operation = true;
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block,
+ block->page.frame + offset
+ + TRX_UNDO_DICT_TRANS, 1U);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*block,
+ block->page.frame + offset
+ + TRX_UNDO_TABLE_ID, 0U);
+ }
+
+ *err = DB_SUCCESS;
+ return block;
+}
+
+/*================ UNDO LOG ASSIGNMENT AND CLEANUP =====================*/
+
+/** Reuse a cached undo log block.
+@param[in,out] trx transaction
+@param[in,out] rseg rollback segment
+@param[out] pundo the undo log memory object
+@param[in,out] mtr mini-transaction
+@return the undo log block
+@retval NULL if none cached */
+static
+buf_block_t*
+trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo,
+ mtr_t* mtr)
+{
+ trx_undo_t* undo = UT_LIST_GET_FIRST(rseg->undo_cached);
+ if (!undo) {
+ return NULL;
+ }
+
+ ut_ad(undo->size == 1);
+ ut_ad(undo->id < TRX_RSEG_N_SLOTS);
+
+ buf_block_t* block = buf_page_get(page_id_t(undo->rseg->space->id,
+ undo->hdr_page_no),
+ 0, RW_X_LATCH, mtr);
+ if (!block) {
+ return NULL;
+ }
+
+ UT_LIST_REMOVE(rseg->undo_cached, undo);
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+
+ *pundo = undo;
+
+ uint16_t offset = trx_undo_header_create(block, trx->id, mtr);
+
+ trx_undo_mem_init_for_reuse(undo, trx->id, &trx->xid, offset);
+
+ if (rseg != trx->rsegs.m_redo.rseg) {
+ return block;
+ }
+
+ if (trx->dict_operation) {
+ undo->dict_operation = TRUE;
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block,
+ block->page.frame + offset
+ + TRX_UNDO_DICT_TRANS, 1U);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*block,
+ block->page.frame + offset
+ + TRX_UNDO_TABLE_ID, 0U);
+ }
+
+ return block;
+}
+
+/** Assign an undo log for a persistent transaction.
+A new undo log is created or a cached undo log reused.
+@param[in,out] trx transaction
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return the undo log block
+@retval NULL on error */
+buf_block_t*
+trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr)
+{
+ ut_ad(mtr->get_log_mode() == MTR_LOG_ALL);
+
+ trx_undo_t* undo = trx->rsegs.m_redo.undo;
+
+ if (undo) {
+ return buf_page_get_gen(
+ page_id_t(undo->rseg->space->id, undo->last_page_no),
+ 0, RW_X_LATCH, undo->guess_block,
+ BUF_GET, mtr, err);
+ }
+
+ trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
+
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
+ buf_block_t* block = trx_undo_reuse_cached(
+ trx, rseg, &trx->rsegs.m_redo.undo, mtr);
+
+ if (!block) {
+ block = trx_undo_create(trx, rseg, &trx->rsegs.m_redo.undo,
+ err, mtr);
+ ut_ad(!block == (*err != DB_SUCCESS));
+ if (!block) {
+ goto func_exit;
+ }
+ } else {
+ *err = DB_SUCCESS;
+ }
+
+ UT_LIST_ADD_FIRST(rseg->undo_list, trx->rsegs.m_redo.undo);
+
+func_exit:
+ rseg->latch.wr_unlock();
+ return block;
+}
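+
+/* Summary of the assignment path above: on the first undo-generating
+operation of a transaction, trx_undo_assign() either reuses a cached undo
+segment (trx_undo_reuse_cached()) or creates a new one (trx_undo_create());
+on later calls it merely X-latches the current last page of
+trx->rsegs.m_redo.undo.  trx_undo_assign_low() below does the same for an
+explicitly given rollback segment, including the temporary one. */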
+
+/** Assign an undo log for a transaction.
+A new undo log is created or a cached undo log reused.
+@param[in,out] trx transaction
+@param[in] rseg rollback segment
+@param[out] undo the undo log
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return the undo log block
+@retval NULL on error */
+buf_block_t*
+trx_undo_assign_low(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
+ dberr_t* err, mtr_t* mtr)
+{
+ ut_d(const bool is_temp = rseg == trx->rsegs.m_noredo.rseg);
+ ut_ad(rseg == trx->rsegs.m_redo.rseg
+ || rseg == trx->rsegs.m_noredo.rseg);
+ ut_ad(undo == (is_temp
+ ? &trx->rsegs.m_noredo.undo
+ : &trx->rsegs.m_redo.undo));
+ ut_ad(mtr->get_log_mode()
+ == (is_temp ? MTR_LOG_NO_REDO : MTR_LOG_ALL));
+
+ if (*undo) {
+ return buf_page_get_gen(
+ page_id_t(rseg->space->id, (*undo)->last_page_no),
+ 0, RW_X_LATCH, (*undo)->guess_block,
+ BUF_GET, mtr, err);
+ }
+
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_too_many_trx",
+ *err = DB_TOO_MANY_CONCURRENT_TRXS; return NULL;
+ );
+
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
+
+ buf_block_t* block = trx_undo_reuse_cached(trx, rseg, undo, mtr);
+
+ if (!block) {
+ block = trx_undo_create(trx, rseg, undo, err, mtr);
+ ut_ad(!block == (*err != DB_SUCCESS));
+ if (!block) {
+ goto func_exit;
+ }
+ } else {
+ *err = DB_SUCCESS;
+ }
+
+ UT_LIST_ADD_FIRST(rseg->undo_list, *undo);
+
+func_exit:
+ rseg->latch.wr_unlock();
+ return block;
+}
+
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction finish.
+@return undo log segment header page, x-latched */
+buf_block_t*
+trx_undo_set_state_at_finish(
+/*=========================*/
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(undo->id < TRX_RSEG_N_SLOTS);
+
+ buf_block_t *block=
+ buf_page_get(page_id_t(undo->rseg->space->id, undo->hdr_page_no), 0,
+ RW_X_LATCH, mtr);
+ /* This function is invoked during transaction commit, which is not
+ allowed to fail. If we get a corrupted undo header, we will crash here. */
+ ut_a(block);
+ const uint16_t state = undo->size == 1 &&
+ TRX_UNDO_PAGE_REUSE_LIMIT >
+ mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE +
+ block->page.frame)
+ ? TRX_UNDO_CACHED
+ : TRX_UNDO_TO_PURGE;
+
+ undo->state= state;
+ mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + block->page.frame,
+ state);
+ return block;
+}
+
+/** Set the state of the undo log segment at an XA PREPARE or XA ROLLBACK.
+@param[in,out] trx transaction
+@param[in,out] undo undo log
+@param[in] rollback false=XA PREPARE, true=XA ROLLBACK
+@param[in,out] mtr mini-transaction */
+void trx_undo_set_state_at_prepare(trx_t *trx, trx_undo_t *undo, bool rollback,
+ mtr_t *mtr)
+{
+ ut_a(undo->id < TRX_RSEG_N_SLOTS);
+
+ buf_block_t* block = buf_page_get(
+ page_id_t(undo->rseg->space->id, undo->hdr_page_no), 0,
+ RW_X_LATCH, mtr);
+ if (UNIV_UNLIKELY(!block)) {
+ /* In the !rollback case (XA PREPARE), a corrupted undo header
+ page would leave the transaction object in an unexpected
+ (active) state. */
+ ut_a(rollback);
+ return;
+ }
+
+ if (rollback) {
+ ut_ad(undo->state == TRX_UNDO_PREPARED);
+ mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+ + block->page.frame, TRX_UNDO_ACTIVE);
+ return;
+ }
+
+ /*------------------------------*/
+ ut_ad(undo->state == TRX_UNDO_ACTIVE);
+ undo->state = TRX_UNDO_PREPARED;
+ undo->xid = trx->xid;
+ /*------------------------------*/
+
+ mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+ + block->page.frame, undo->state);
+ uint16_t offset = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+ + block->page.frame);
+ mtr->write<1>(*block, block->page.frame + offset + TRX_UNDO_XID_EXISTS,
+ 1U);
+
+ trx_undo_write_xid(block, offset, undo->xid, mtr);
+}
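+
+/* Undo log segment state transitions implemented in this file:
+
+	TRX_UNDO_ACTIVE    --XA PREPARE-->   TRX_UNDO_PREPARED
+	TRX_UNDO_PREPARED  --XA ROLLBACK-->  TRX_UNDO_ACTIVE
+	ACTIVE or PREPARED --commit-------->  TRX_UNDO_CACHED or TRX_UNDO_TO_PURGE
+
+(trx_undo_set_state_at_prepare() and trx_undo_set_state_at_finish() above).
+TRX_UNDO_CACHED segments stay in rseg->undo_cached for reuse by
+trx_undo_reuse_cached(); TRX_UNDO_TO_PURGE segments of the temporary
+tablespace are freed right away in trx_undo_commit_cleanup() below, while
+persistent ones are left to the purge subsystem. */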
+
+/** Free temporary undo log after commit or rollback.
+The information is not needed after a commit or rollback, therefore
+the data can be discarded.
+@param undo temporary undo log */
+void trx_undo_commit_cleanup(trx_undo_t *undo)
+{
+ trx_rseg_t* rseg = undo->rseg;
+ ut_ad(rseg->space == fil_system.temp_space);
+
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
+
+ UT_LIST_REMOVE(rseg->undo_list, undo);
+
+ if (undo->state == TRX_UNDO_CACHED) {
+ UT_LIST_ADD_FIRST(rseg->undo_cached, undo);
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ undo = nullptr;
+ } else {
+ ut_ad(undo->state == TRX_UNDO_TO_PURGE);
+
+ /* Delete first the undo log segment in the file */
+ trx_undo_seg_free(undo);
+
+ ut_ad(rseg->curr_size > undo->size);
+ rseg->curr_size -= undo->size;
+ }
+
+ rseg->latch.wr_unlock();
+ ut_free(undo);
+}
+
+/** At shutdown, frees the undo logs of a transaction. */
+void trx_undo_free_at_shutdown(trx_t *trx)
+{
+ if (trx_undo_t*& undo = trx->rsegs.m_redo.undo) {
+ switch (undo->state) {
+ case TRX_UNDO_PREPARED:
+ break;
+ case TRX_UNDO_CACHED:
+ case TRX_UNDO_TO_PURGE:
+ ut_ad(trx_state_eq(trx,
+ TRX_STATE_COMMITTED_IN_MEMORY));
+ /* fall through */
+ case TRX_UNDO_ACTIVE:
+ /* trx_t::commit_state() assigns
+ trx->state = TRX_STATE_COMMITTED_IN_MEMORY. */
+ ut_a(!srv_was_started
+ || srv_read_only_mode
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
+ || srv_fast_shutdown);
+ break;
+ default:
+ ut_error;
+ }
+
+ UT_LIST_REMOVE(trx->rsegs.m_redo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo = NULL;
+ }
+ if (trx_undo_t*& undo = trx->rsegs.m_noredo.undo) {
+ ut_a(undo->state == TRX_UNDO_PREPARED);
+
+ UT_LIST_REMOVE(trx->rsegs.m_noredo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo = NULL;
+ }
+}