Diffstat (limited to 'storage/innobase/trx')
-rw-r--r--   storage/innobase/trx/trx0i_s.cc     1490
-rw-r--r--   storage/innobase/trx/trx0purge.cc   1297
-rw-r--r--   storage/innobase/trx/trx0rec.cc     2559
-rw-r--r--   storage/innobase/trx/trx0roll.cc     984
-rw-r--r--   storage/innobase/trx/trx0rseg.cc     768
-rw-r--r--   storage/innobase/trx/trx0sys.cc      339
-rw-r--r--   storage/innobase/trx/trx0trx.cc     2300
-rw-r--r--   storage/innobase/trx/trx0undo.cc    1401
8 files changed, 11138 insertions, 0 deletions
diff --git a/storage/innobase/trx/trx0i_s.cc b/storage/innobase/trx/trx0i_s.cc
new file mode 100644
index 00000000..d043c3d8
--- /dev/null
+++ b/storage/innobase/trx/trx0i_s.cc
@@ -0,0 +1,1490 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0i_s.cc
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables fetch code.
+
+The code below fetches information needed to fill those
+3 dynamic tables and uploads it into a "transactions
+table cache" for later retrieval.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+#include "trx0i_s.h"
+#include "buf0buf.h"
+#include "dict0dict.h"
+#include "ha0storage.h"
+#include "hash0hash.h"
+#include "lock0iter.h"
+#include "lock0lock.h"
+#include "mem0mem.h"
+#include "page0page.h"
+#include "rem0rec.h"
+#include "row0row.h"
+#include "srv0srv.h"
+#include "sync0rw.h"
+#include "sync0sync.h"
+#include "trx0sys.h"
+#include "que0que.h"
+#include "trx0purge.h"
+#include "sql_class.h"
+
+/** Initial number of rows in the table cache */
+#define TABLE_CACHE_INITIAL_ROWSNUM 1024
+
+/** @brief The maximum number of chunks to allocate for a table cache.
+
+The rows of a table cache are stored in a set of chunks. When a new
+row is added a new chunk is allocated if necessary. Assuming that the
+first one is 1024 rows (TABLE_CACHE_INITIAL_ROWSNUM) and each
+subsequent is N/2 where N is the number of rows we have allocated till
+now, then the 39th chunk would accommodate 1677416425 rows and all chunks
+would accommodate 3354832851 rows. */
+#define MEM_CHUNKS_IN_TABLE_CACHE 39
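The growth rule above can be followed with a small standalone sketch (illustrative only, not part of this file; it ignores the TRX_I_S_MEM_LIMIT cap that bounds allocation in practice): chunk 0 holds TABLE_CACHE_INITIAL_ROWSNUM rows and every later chunk holds half of the rows allocated so far, so capacity grows by roughly a factor of 1.5 per chunk.

#include <cstdint>
#include <cstdio>

int main()
{
  uint64_t total = 0;
  for (int i = 0; i < 39; i++) {            /* MEM_CHUNKS_IN_TABLE_CACHE */
    uint64_t chunk = i ? total / 2 : 1024;  /* TABLE_CACHE_INITIAL_ROWSNUM */
    total += chunk;
    std::printf("chunk %2d: %12llu rows, %12llu total\n", i,
                (unsigned long long) chunk, (unsigned long long) total);
  }
  return 0;
}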
+
+/** The following are some testing auxiliary macros. Do not enable them
+in a production environment. */
+/* @{ */
+
+#if 0
+/** If this is enabled then lock folds will always be different,
+resulting in equal rows being put in different cells of the hash
+table. Checking for duplicates will be flawed because a different
+fold will be calculated when a row is searched in the hash table. */
+#define TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+#endif
+
+#if 0
+/** This effectively kills the search-for-duplicate-before-adding-a-row
+function, but searching in the hash is still performed. It will always
+be assumed that lock is not present and insertion will be performed in
+the hash table. */
+#define TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+#endif
+
+#if 0
+/** This aggressively repeats adding each row many times. Depending on
+the above settings this may be noop or may result in lots of rows being
+added. */
+#define TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+#endif
+
+#if 0
+/** Very similar to TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T but hash
+table search is not performed at all. */
+#define TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+#endif
+
+#if 0
+/** Do not insert each row into the hash table. Duplicates may appear
+if this is enabled; also, searching the hash is then a no-op because
+it will be empty. */
+#define TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+#endif
+/* @} */
+
+/** Memory limit passed to ha_storage_put_memlim().
+@param cache hash storage
+@return maximum allowed allocation size */
+#define MAX_ALLOWED_FOR_STORAGE(cache) \
+ (TRX_I_S_MEM_LIMIT \
+ - (cache)->mem_allocd)
+
+/** Memory limit in table_cache_create_empty_row().
+@param cache hash storage
+@return maximum allowed allocation size */
+#define MAX_ALLOWED_FOR_ALLOC(cache) \
+ (TRX_I_S_MEM_LIMIT \
+ - (cache)->mem_allocd \
+ - ha_storage_get_size((cache)->storage))
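As a worked example (assuming TRX_I_S_MEM_LIMIT is 16MiB, i.e. 16777216 bytes): if 1MiB is already held by row chunks (cache->mem_allocd) and the string storage holds another 0.5MiB, then MAX_ALLOWED_FOR_STORAGE(cache) permits up to 15MiB of further string storage, while MAX_ALLOWED_FOR_ALLOC(cache) permits at most 16MiB - 1MiB - 0.5MiB = 14.5MiB for the next row chunk.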
+
+/** Memory for each table in the intermediate buffer is allocated in
+separate chunks. These chunks are considered to be concatenated to
+represent one flat array of rows. */
+struct i_s_mem_chunk_t {
+ ulint offset; /*!< offset, in number of rows */
+ ulint rows_allocd; /*!< the size of this chunk, in number
+ of rows */
+ void* base; /*!< start of the chunk */
+};
+
+/** This represents one table's cache. */
+struct i_s_table_cache_t {
+ ulint rows_used; /*!< number of used rows */
+ ulint rows_allocd; /*!< number of allocated rows */
+ ulint row_size; /*!< size of a single row */
+ i_s_mem_chunk_t chunks[MEM_CHUNKS_IN_TABLE_CACHE]; /*!< array of
+ memory chunks that stores the
+ rows */
+};
+
+/** This structure describes the intermediate buffer */
+struct trx_i_s_cache_t {
+ rw_lock_t rw_lock; /*!< read-write lock protecting
+ the rest of this structure */
+ Atomic_relaxed<ulonglong> last_read;
+ /*!< last time the cache was read;
+ measured in nanoseconds */
+ i_s_table_cache_t innodb_trx; /*!< innodb_trx table */
+ i_s_table_cache_t innodb_locks; /*!< innodb_locks table */
+ i_s_table_cache_t innodb_lock_waits;/*!< innodb_lock_waits table */
+/** the hash table size is LOCKS_HASH_CELLS_NUM * sizeof(void*) bytes */
+#define LOCKS_HASH_CELLS_NUM 10000
+ hash_table_t locks_hash; /*!< hash table used to eliminate
+ duplicate entries in the
+ innodb_locks table */
+/** Initial size of the cache storage */
+#define CACHE_STORAGE_INITIAL_SIZE 1024
+/** Number of hash cells in the cache storage */
+#define CACHE_STORAGE_HASH_CELLS 2048
+ ha_storage_t* storage; /*!< storage for external volatile
+ data that may become unavailable
+ when we release
+ lock_sys.mutex */
+ ulint mem_allocd; /*!< the amount of memory
+ allocated with mem_alloc*() */
+ bool is_truncated; /*!< this is true if the memory
+ limit was hit and thus the data
+ in the cache is truncated */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+static trx_i_s_cache_t trx_i_s_cache_static;
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+trx_i_s_cache_t* trx_i_s_cache = &trx_i_s_cache_static;
+
+/** @return the heap number of a record lock
+@retval 0xFFFF for table locks */
+static uint16_t wait_lock_get_heap_no(const lock_t *lock)
+{
+ return lock_get_type(lock) == LOCK_REC
+ ? static_cast<uint16_t>(lock_rec_find_set_bit(lock))
+ : uint16_t{0xFFFF};
+}
+
+/*******************************************************************//**
+Initializes the members of a table cache. */
+static
+void
+table_cache_init(
+/*=============*/
+ i_s_table_cache_t* table_cache, /*!< out: table cache */
+ size_t row_size) /*!< in: the size of a
+ row */
+{
+ ulint i;
+
+ table_cache->rows_used = 0;
+ table_cache->rows_allocd = 0;
+ table_cache->row_size = row_size;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ /* the memory is actually allocated in
+ table_cache_create_empty_row() */
+ table_cache->chunks[i].base = NULL;
+ }
+}
+
+/*******************************************************************//**
+Frees a table cache. */
+static
+void
+table_cache_free(
+/*=============*/
+ i_s_table_cache_t* table_cache) /*!< in/out: table cache */
+{
+ ulint i;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ /* the memory is actually allocated in
+ table_cache_create_empty_row() */
+ if (table_cache->chunks[i].base) {
+ ut_free(table_cache->chunks[i].base);
+ table_cache->chunks[i].base = NULL;
+ }
+ }
+}
+
+/*******************************************************************//**
+Returns an empty row from a table cache. The row is allocated if no more
+empty rows are available. The number of used rows is incremented.
+If the memory limit is hit then NULL is returned and nothing is
+allocated.
+@return empty row, or NULL if out of memory */
+static
+void*
+table_cache_create_empty_row(
+/*=========================*/
+ i_s_table_cache_t* table_cache, /*!< in/out: table cache */
+ trx_i_s_cache_t* cache) /*!< in/out: cache to record
+ how many bytes are
+ allocated */
+{
+ ulint i;
+ void* row;
+
+ ut_a(table_cache->rows_used <= table_cache->rows_allocd);
+
+ if (table_cache->rows_used == table_cache->rows_allocd) {
+
+ /* rows_used == rows_allocd means that new chunk needs
+ to be allocated: either no more empty rows in the
+ last allocated chunk or nothing has been allocated yet
+ (rows_num == rows_allocd == 0); */
+
+ i_s_mem_chunk_t* chunk;
+ ulint req_bytes;
+ ulint got_bytes;
+ ulint req_rows;
+ ulint got_rows;
+
+ /* find the first not allocated chunk */
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].base == NULL) {
+
+ break;
+ }
+ }
+
+ /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+ have been allocated :-X */
+ ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+ /* allocate the chunk we just found */
+
+ if (i == 0) {
+
+ /* first chunk, nothing is allocated yet */
+ req_rows = TABLE_CACHE_INITIAL_ROWSNUM;
+ } else {
+
+ /* Memory is increased by the formula
+ new = old + old / 2; We are trying not to be
+ aggressive here (= using the common new = old * 2)
+ because the allocated memory will not be freed
+ until InnoDB exit (it is reused). So it is better
+ to once allocate the memory in more steps, but
+ have less unused/wasted memory than to use less
+ steps in allocation (which is done once in a
+ lifetime) but end up with lots of unused/wasted
+ memory. */
+ req_rows = table_cache->rows_allocd / 2;
+ }
+ req_bytes = req_rows * table_cache->row_size;
+
+ if (req_bytes > MAX_ALLOWED_FOR_ALLOC(cache)) {
+
+ return(NULL);
+ }
+
+ chunk = &table_cache->chunks[i];
+
+ got_bytes = req_bytes;
+ chunk->base = ut_malloc_nokey(req_bytes);
+
+ got_rows = got_bytes / table_cache->row_size;
+
+ cache->mem_allocd += got_bytes;
+
+#if 0
+ printf("allocating chunk %d req bytes=%lu, got bytes=%lu,"
+ " row size=%lu,"
+ " req rows=%lu, got rows=%lu\n",
+ i, req_bytes, got_bytes,
+ table_cache->row_size,
+ req_rows, got_rows);
+#endif
+
+ chunk->rows_allocd = got_rows;
+
+ table_cache->rows_allocd += got_rows;
+
+ /* adjust the offset of the next chunk */
+ if (i < MEM_CHUNKS_IN_TABLE_CACHE - 1) {
+
+ table_cache->chunks[i + 1].offset
+ = chunk->offset + chunk->rows_allocd;
+ }
+
+ /* return the first empty row in the newly allocated
+ chunk */
+ row = chunk->base;
+ } else {
+
+ char* chunk_start;
+ ulint offset;
+
+ /* there is an empty row, no need to allocate new
+ chunks */
+
+ /* find the first chunk that contains allocated but
+ empty/unused rows */
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].offset
+ + table_cache->chunks[i].rows_allocd
+ > table_cache->rows_used) {
+
+ break;
+ }
+ }
+
+ /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+ are full, but
+ table_cache->rows_used != table_cache->rows_allocd means
+ exactly the opposite - there are allocated but
+ empty/unused rows :-X */
+ ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+ chunk_start = (char*) table_cache->chunks[i].base;
+ offset = table_cache->rows_used
+ - table_cache->chunks[i].offset;
+
+ row = chunk_start + offset * table_cache->row_size;
+ }
+
+ table_cache->rows_used++;
+
+ return(row);
+}
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Validates a row in the locks cache.
+@return TRUE if valid */
+static
+ibool
+i_s_locks_row_validate(
+/*===================*/
+ const i_s_locks_row_t* row) /*!< in: row to validate */
+{
+ ut_ad(row->lock_mode);
+ ut_ad(row->lock_table != NULL);
+ ut_ad(row->lock_table_id != 0);
+
+ if (!row->lock_index) {
+ /* table lock */
+ ut_ad(!row->lock_data);
+ ut_ad(row->lock_page == page_id_t(0, 0));
+ ut_ad(!row->lock_rec);
+ } else {
+ /* record lock */
+ /* row->lock_data == NULL if buf_page_try_get() == NULL */
+ }
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Fills i_s_trx_row_t object.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_trx_row(
+/*=========*/
+ i_s_trx_row_t* row, /*!< out: result object
+ that's filled */
+ const trx_t* trx, /*!< in: transaction to
+ get data from */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ corresponding row in
+ innodb_locks if trx is
+ waiting or NULL if trx
+ is not waiting */
+ trx_i_s_cache_t* cache) /*!< in/out: cache into
+ which to copy volatile
+ strings */
+{
+ const char* s;
+
+ ut_ad(lock_mutex_own());
+
+ row->trx_id = trx_get_id_for_print(trx);
+ row->trx_started = trx->start_time;
+ row->trx_state = trx_get_que_state_str(trx);
+ row->requested_lock_row = requested_lock_row;
+ ut_ad(requested_lock_row == NULL
+ || i_s_locks_row_validate(requested_lock_row));
+
+ if (trx->lock.wait_lock != NULL) {
+
+ ut_a(requested_lock_row != NULL);
+ row->trx_wait_started = trx->lock.wait_started;
+ } else {
+ ut_a(requested_lock_row == NULL);
+ row->trx_wait_started = 0;
+ }
+
+ row->trx_weight = static_cast<uintmax_t>(TRX_WEIGHT(trx));
+
+ if (trx->mysql_thd == NULL) {
+ /* For internal transactions e.g., purge and transactions
+ being recovered at startup there is no associated MySQL
+ thread data structure. */
+ row->trx_mysql_thread_id = 0;
+ row->trx_query = NULL;
+ goto thd_done;
+ }
+
+ row->trx_mysql_thread_id = thd_get_thread_id(trx->mysql_thd);
+
+ char query[TRX_I_S_TRX_QUERY_MAX_LEN + 1];
+ if (size_t stmt_len = thd_query_safe(trx->mysql_thd, query,
+ sizeof query)) {
+ row->trx_query = static_cast<const char*>(
+ ha_storage_put_memlim(
+ cache->storage, query, stmt_len + 1,
+ MAX_ALLOWED_FOR_STORAGE(cache)));
+
+ row->trx_query_cs = thd_charset(trx->mysql_thd);
+
+ if (row->trx_query == NULL) {
+
+ return(FALSE);
+ }
+ } else {
+
+ row->trx_query = NULL;
+ }
+
+thd_done:
+ row->trx_operation_state = trx->op_info;
+
+ row->trx_tables_in_use = trx->n_mysql_tables_in_use;
+
+ row->trx_tables_locked = lock_number_of_tables_locked(&trx->lock);
+
+	/* These are protected by both trx->mutex and lock_sys.mutex,
+ or just lock_sys.mutex. For reading, it suffices to hold
+ lock_sys.mutex. */
+
+ row->trx_lock_structs = UT_LIST_GET_LEN(trx->lock.trx_locks);
+
+ row->trx_lock_memory_bytes = mem_heap_get_size(trx->lock.lock_heap);
+
+ row->trx_rows_locked = lock_number_of_rows_locked(&trx->lock);
+
+ row->trx_rows_modified = trx->undo_no;
+
+ row->trx_isolation_level = trx->isolation_level;
+
+ row->trx_unique_checks = (ibool) trx->check_unique_secondary;
+
+ row->trx_foreign_key_checks = (ibool) trx->check_foreigns;
+
+ s = trx->detailed_error;
+
+ if (s != NULL && s[0] != '\0') {
+
+ TRX_I_S_STRING_COPY(s,
+ row->trx_foreign_key_error,
+ TRX_I_S_TRX_FK_ERROR_MAX_LEN, cache);
+
+ if (row->trx_foreign_key_error == NULL) {
+
+ return(FALSE);
+ }
+ } else {
+ row->trx_foreign_key_error = NULL;
+ }
+
+ row->trx_is_read_only = trx->read_only;
+
+ row->trx_is_autocommit_non_locking = trx->is_autocommit_non_locking();
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Format the nth field of "rec" and put it in "buf". The result is always
+NUL-terminated. Returns the number of bytes that were written to "buf"
+(including the terminating NUL).
+@return number of bytes written, including the terminating NUL */
+static
+ulint
+put_nth_field(
+/*==========*/
+ char* buf, /*!< out: buffer */
+ ulint buf_size,/*!< in: buffer size in bytes */
+ ulint n, /*!< in: number of field */
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record */
+ const rec_offs* offsets)/*!< in: record offsets, returned
+ by rec_get_offsets() */
+{
+ const byte* data;
+ ulint data_len;
+ dict_field_t* dict_field;
+ ulint ret;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (buf_size == 0) {
+
+ return(0);
+ }
+
+ ret = 0;
+
+ if (n > 0) {
+ /* we must append ", " before the actual data */
+
+ if (buf_size < 3) {
+
+ buf[0] = '\0';
+ return(1);
+ }
+
+ memcpy(buf, ", ", 3);
+
+ buf += 2;
+ buf_size -= 2;
+ ret += 2;
+ }
+
+ /* now buf_size >= 1 */
+
+ data = rec_get_nth_field(rec, offsets, n, &data_len);
+
+ dict_field = dict_index_get_nth_field(index, n);
+
+ ret += row_raw_format((const char*) data, data_len,
+ dict_field, buf, buf_size);
+
+ return(ret);
+}
+
+/*******************************************************************//**
+Fills the "lock_data" member of i_s_locks_row_t object.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_lock_data(
+/*===========*/
+ const char** lock_data,/*!< out: "lock_data" to fill */
+ const lock_t* lock, /*!< in: lock used to find the data */
+ ulint heap_no,/*!< in: rec num used to find the data */
+ trx_i_s_cache_t* cache) /*!< in/out: cache where to store
+ volatile data */
+{
+ ut_a(lock_get_type(lock) == LOCK_REC);
+
+ switch (heap_no) {
+ case PAGE_HEAP_NO_INFIMUM:
+ case PAGE_HEAP_NO_SUPREMUM:
+ *lock_data = ha_storage_put_str_memlim(
+ cache->storage,
+ heap_no == PAGE_HEAP_NO_INFIMUM
+ ? "infimum pseudo-record"
+ : "supremum pseudo-record",
+ MAX_ALLOWED_FOR_STORAGE(cache));
+ return(*lock_data != NULL);
+ }
+
+ mtr_t mtr;
+
+ const buf_block_t* block;
+ const page_t* page;
+ const rec_t* rec;
+ const dict_index_t* index;
+ ulint n_fields;
+ mem_heap_t* heap;
+ rec_offs offsets_onstack[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets;
+ char buf[TRX_I_S_LOCK_DATA_MAX_LEN];
+ ulint buf_used;
+ ulint i;
+
+ mtr_start(&mtr);
+
+ block = buf_page_try_get(lock->un_member.rec_lock.page_id, &mtr);
+
+ if (block == NULL) {
+
+ *lock_data = NULL;
+
+ mtr_commit(&mtr);
+
+ return(TRUE);
+ }
+
+ page = reinterpret_cast<const page_t*>(buf_block_get_frame(block));
+
+ rec_offs_init(offsets_onstack);
+ offsets = offsets_onstack;
+
+ rec = page_find_rec_with_heap_no(page, heap_no);
+
+ index = lock_rec_get_index(lock);
+
+ n_fields = dict_index_get_n_unique(index);
+
+ ut_a(n_fields > 0);
+
+ heap = NULL;
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ n_fields, &heap);
+
+ /* format and store the data */
+
+ buf_used = 0;
+ for (i = 0; i < n_fields; i++) {
+
+ buf_used += put_nth_field(
+ buf + buf_used, sizeof(buf) - buf_used,
+ i, index, rec, offsets) - 1;
+ }
+
+ *lock_data = (const char*) ha_storage_put_memlim(
+ cache->storage, buf, buf_used + 1,
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ if (heap != NULL) {
+
+ /* this means that rec_get_offsets() has created a new
+ heap and has stored offsets in it; check that this is
+ really the case and free the heap */
+ ut_a(offsets != offsets_onstack);
+ mem_heap_free(heap);
+ }
+
+ mtr_commit(&mtr);
+
+ if (*lock_data == NULL) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Fills i_s_locks_row_t object.
+If memory cannot be allocated then false is returned.
+@return false if allocation fails */
+static bool fill_locks_row(
+ i_s_locks_row_t* row, /*!< out: result object that's filled */
+ const lock_t* lock, /*!< in: lock to get data from */
+ uint16_t heap_no,/*!< in: lock's record number
+				or 0xFFFF if the lock
+ is a table lock */
+ trx_i_s_cache_t* cache) /*!< in/out: cache into which to copy
+ volatile strings */
+{
+ row->lock_trx_id = lock->trx->id;
+ const auto lock_type = lock_get_type(lock);
+ ut_ad(lock_type == LOCK_REC || lock_type == LOCK_TABLE);
+
+ const bool is_gap_lock = lock_type == LOCK_REC
+ && (lock->type_mode & LOCK_GAP);
+ switch (lock->type_mode & LOCK_MODE_MASK) {
+ case LOCK_S:
+ row->lock_mode = uint8_t(1 + is_gap_lock);
+ break;
+ case LOCK_X:
+ row->lock_mode = uint8_t(3 + is_gap_lock);
+ break;
+ case LOCK_IS:
+ row->lock_mode = uint8_t(5 + is_gap_lock);
+ break;
+ case LOCK_IX:
+ row->lock_mode = uint8_t(7 + is_gap_lock);
+ break;
+ case LOCK_AUTO_INC:
+ row->lock_mode = 9;
+ break;
+ default:
+ ut_ad("unknown lock mode" == 0);
+ row->lock_mode = 0;
+ }
+
+ row->lock_table = ha_storage_put_str_memlim(
+ cache->storage, lock_get_table_name(lock).m_name,
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ /* memory could not be allocated */
+ if (row->lock_table == NULL) {
+
+ return false;
+ }
+
+ if (lock_type == LOCK_REC) {
+ row->lock_index = ha_storage_put_str_memlim(
+ cache->storage, lock_rec_get_index_name(lock),
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ /* memory could not be allocated */
+ if (row->lock_index == NULL) {
+
+ return false;
+ }
+
+ row->lock_page = lock->un_member.rec_lock.page_id;
+ row->lock_rec = heap_no;
+
+ if (!fill_lock_data(&row->lock_data, lock, heap_no, cache)) {
+
+ /* memory could not be allocated */
+ return false;
+ }
+ } else {
+ row->lock_index = NULL;
+
+ row->lock_page = page_id_t(0, 0);
+ row->lock_rec = 0;
+
+ row->lock_data = NULL;
+ }
+
+ row->lock_table_id = lock_get_table_id(lock);
+
+ row->hash_chain.value = row;
+ ut_ad(i_s_locks_row_validate(row));
+
+ return true;
+}
+
+/*******************************************************************//**
+Fills i_s_lock_waits_row_t object. Returns its first argument.
+@return result object that's filled */
+static
+i_s_lock_waits_row_t*
+fill_lock_waits_row(
+/*================*/
+ i_s_lock_waits_row_t* row, /*!< out: result object
+ that's filled */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ relevant requested lock
+ row in innodb_locks */
+ const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the
+ relevant blocking lock
+ row in innodb_locks */
+{
+ ut_ad(i_s_locks_row_validate(requested_lock_row));
+ ut_ad(i_s_locks_row_validate(blocking_lock_row));
+
+ row->requested_lock_row = requested_lock_row;
+ row->blocking_lock_row = blocking_lock_row;
+
+ return(row);
+}
+
+/*******************************************************************//**
+Calculates a hash fold for a lock. For a record lock the fold is
+calculated from 4 elements, which uniquely identify a lock at a given
+point in time: transaction id, space id, page number, record number.
+For a table lock the fold is table's id.
+@return fold */
+static
+ulint
+fold_lock(
+/*======*/
+ const lock_t* lock, /*!< in: lock object to fold */
+ ulint heap_no)/*!< in: lock's record number
+ or 0xFFFF if the lock
+ is a table lock */
+{
+#ifdef TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+ static ulint fold = 0;
+
+ return(fold++);
+#else
+ ulint ret;
+
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ ut_a(heap_no != 0xFFFF);
+ ret = ut_fold_ulint_pair((ulint) lock->trx->id,
+ lock->un_member.rec_lock.page_id.
+ fold());
+ ret = ut_fold_ulint_pair(ret, heap_no);
+
+ break;
+ case LOCK_TABLE:
+ /* this check is actually not necessary for continuing
+ correct operation, but something must have gone wrong if
+ it fails. */
+ ut_a(heap_no == 0xFFFF);
+
+ ret = (ulint) lock_get_table_id(lock);
+
+ break;
+ default:
+ ut_error;
+ }
+
+ return(ret);
+#endif
+}
+
+/*******************************************************************//**
+Checks whether i_s_locks_row_t object represents a lock_t object.
+@return TRUE if they match */
+static
+ibool
+locks_row_eq_lock(
+/*==============*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ const lock_t* lock, /*!< in: lock object */
+ ulint heap_no)/*!< in: lock's record number
+ or 0xFFFF if the lock
+ is a table lock */
+{
+ ut_ad(i_s_locks_row_validate(row));
+#ifdef TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+ return(0);
+#else
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ ut_a(heap_no != 0xFFFF);
+
+ return(row->lock_trx_id == lock->trx->id
+ && row->lock_page == lock->un_member.rec_lock.page_id
+ && row->lock_rec == heap_no);
+
+ case LOCK_TABLE:
+ /* this check is actually not necessary for continuing
+ correct operation, but something must have gone wrong if
+ it fails. */
+ ut_a(heap_no == 0xFFFF);
+
+ return(row->lock_trx_id == lock->trx->id
+ && row->lock_table_id == lock_get_table_id(lock));
+
+ default:
+ ut_error;
+ return(FALSE);
+ }
+#endif
+}
+
+/*******************************************************************//**
+Searches for a row in the innodb_locks cache that has a specified id.
+This happens in O(1) time since a hash table is used. Returns pointer to
+the row or NULL if none is found.
+@return row or NULL */
+static
+i_s_locks_row_t*
+search_innodb_locks(
+/*================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ const lock_t* lock, /*!< in: lock to search for */
+ uint16_t heap_no)/*!< in: lock's record number
+ or 0xFFFF if the lock
+ is a table lock */
+{
+ i_s_hash_chain_t* hash_chain;
+
+ HASH_SEARCH(
+ /* hash_chain->"next" */
+ next,
+ /* the hash table */
+ &cache->locks_hash,
+ /* fold */
+ fold_lock(lock, heap_no),
+ /* the type of the next variable */
+ i_s_hash_chain_t*,
+ /* auxiliary variable */
+ hash_chain,
+ /* assertion on every traversed item */
+ ut_ad(i_s_locks_row_validate(hash_chain->value)),
+ /* this determines if we have found the lock */
+ locks_row_eq_lock(hash_chain->value, lock, heap_no));
+
+ if (hash_chain == NULL) {
+
+ return(NULL);
+ }
+ /* else */
+
+ return(hash_chain->value);
+}
+
+/*******************************************************************//**
+Adds new element to the locks cache, enlarging it if necessary.
+Returns a pointer to the added row. If the row is already present then
+no row is added and a pointer to the existing row is returned.
+If row can not be allocated then NULL is returned.
+@return row */
+static
+i_s_locks_row_t*
+add_lock_to_cache(
+/*==============*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const lock_t* lock, /*!< in: the element to add */
+ uint16_t heap_no)/*!< in: lock's record number
+				or 0xFFFF if the lock
+ is a table lock */
+{
+ i_s_locks_row_t* dst_row;
+
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+ ulint i;
+ for (i = 0; i < 10000; i++) {
+#endif
+#ifndef TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+ /* quit if this lock is already present */
+ dst_row = search_innodb_locks(cache, lock, heap_no);
+ if (dst_row != NULL) {
+
+ ut_ad(i_s_locks_row_validate(dst_row));
+ return(dst_row);
+ }
+#endif
+
+ dst_row = (i_s_locks_row_t*)
+ table_cache_create_empty_row(&cache->innodb_locks, cache);
+
+ /* memory could not be allocated */
+ if (dst_row == NULL) {
+
+ return(NULL);
+ }
+
+ if (!fill_locks_row(dst_row, lock, heap_no, cache)) {
+
+ /* memory could not be allocated */
+ cache->innodb_locks.rows_used--;
+ return(NULL);
+ }
+
+#ifndef TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+ HASH_INSERT(
+ /* the type used in the hash chain */
+ i_s_hash_chain_t,
+ /* hash_chain->"next" */
+ next,
+ /* the hash table */
+ &cache->locks_hash,
+ /* fold */
+ fold_lock(lock, heap_no),
+ /* add this data to the hash */
+ &dst_row->hash_chain);
+#endif
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+ } /* for()-loop */
+#endif
+
+ ut_ad(i_s_locks_row_validate(dst_row));
+ return(dst_row);
+}
+
+/*******************************************************************//**
+Adds new pair of locks to the lock waits cache.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+add_lock_wait_to_cache(
+/*===================*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ relevant requested lock
+ row in innodb_locks */
+ const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the
+ relevant blocking lock
+ row in innodb_locks */
+{
+ i_s_lock_waits_row_t* dst_row;
+
+ dst_row = (i_s_lock_waits_row_t*)
+ table_cache_create_empty_row(&cache->innodb_lock_waits,
+ cache);
+
+ /* memory could not be allocated */
+ if (dst_row == NULL) {
+
+ return(FALSE);
+ }
+
+ fill_lock_waits_row(dst_row, requested_lock_row, blocking_lock_row);
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Adds transaction's relevant (important) locks to cache.
+If the transaction is waiting, then the wait lock is added to
+innodb_locks and a pointer to the added row is returned in
+requested_lock_row, otherwise requested_lock_row is set to NULL.
+If rows can not be allocated then FALSE is returned and the value of
+requested_lock_row is undefined.
+@return FALSE if allocation fails */
+static
+ibool
+add_trx_relevant_locks_to_cache(
+/*============================*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const trx_t* trx, /*!< in: transaction */
+ i_s_locks_row_t** requested_lock_row)/*!< out: pointer to the
+ requested lock row, or NULL or
+ undefined */
+{
+ ut_ad(lock_mutex_own());
+
+ /* If transaction is waiting we add the wait lock and all locks
+ from another transactions that are blocking the wait lock. */
+ if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+ const lock_t* curr_lock;
+ i_s_locks_row_t* blocking_lock_row;
+ lock_queue_iterator_t iter;
+
+ ut_a(trx->lock.wait_lock != NULL);
+
+ uint16_t wait_lock_heap_no
+ = wait_lock_get_heap_no(trx->lock.wait_lock);
+
+ /* add the requested lock */
+ *requested_lock_row
+ = add_lock_to_cache(cache, trx->lock.wait_lock,
+ wait_lock_heap_no);
+
+ /* memory could not be allocated */
+ if (*requested_lock_row == NULL) {
+
+ return(FALSE);
+ }
+
+ /* then iterate over the locks before the wait lock and
+ add the ones that are blocking it */
+
+ lock_queue_iterator_reset(&iter, trx->lock.wait_lock,
+ ULINT_UNDEFINED);
+
+ for (curr_lock = lock_queue_iterator_get_prev(&iter);
+ curr_lock != NULL;
+ curr_lock = lock_queue_iterator_get_prev(&iter)) {
+
+ if (lock_has_to_wait(trx->lock.wait_lock,
+ curr_lock)) {
+
+ /* add the lock that is
+ blocking trx->lock.wait_lock */
+ blocking_lock_row
+ = add_lock_to_cache(
+ cache, curr_lock,
+ /* heap_no is the same
+ for the wait and waited
+ locks */
+ wait_lock_heap_no);
+
+ /* memory could not be allocated */
+ if (blocking_lock_row == NULL) {
+
+ return(FALSE);
+ }
+
+ /* add the relation between both locks
+ to innodb_lock_waits */
+ if (!add_lock_wait_to_cache(
+ cache, *requested_lock_row,
+ blocking_lock_row)) {
+
+ /* memory could not be allocated */
+ return(FALSE);
+ }
+ }
+ }
+ } else {
+
+ *requested_lock_row = NULL;
+ }
+
+ return(TRUE);
+}
+
+/** The minimum time that a cache must not be updated after it has been
+read for the last time; measured in nanoseconds. We use this technique
+to ensure that SELECTs which join several INFORMATION SCHEMA tables read
+the same version of the cache. */
+#define CACHE_MIN_IDLE_TIME_NS 100000000 /* 0.1 sec */
+
+/*******************************************************************//**
+Checks if the cache can safely be updated.
+@return whether the cache can be updated */
+static bool can_cache_be_updated(trx_i_s_cache_t* cache)
+{
+ /* cache->last_read is only updated when a shared rw lock on the
+ whole cache is being held (see trx_i_s_cache_end_read()) and
+ we are currently holding an exclusive rw lock on the cache.
+ So it is not possible for last_read to be updated while we are
+ reading it. */
+
+ ut_ad(rw_lock_own(&cache->rw_lock, RW_LOCK_X));
+
+ return my_interval_timer() - cache->last_read > CACHE_MIN_IDLE_TIME_NS;
+}
+
+/*******************************************************************//**
+Declare a cache empty, preparing it to be filled up. Not all resources
+are freed because they can be reused. */
+static
+void
+trx_i_s_cache_clear(
+/*================*/
+ trx_i_s_cache_t* cache) /*!< out: cache to clear */
+{
+ cache->innodb_trx.rows_used = 0;
+ cache->innodb_locks.rows_used = 0;
+ cache->innodb_lock_waits.rows_used = 0;
+
+ cache->locks_hash.clear();
+
+ ha_storage_empty(&cache->storage);
+}
+
+
+/**
+ Add transactions to innodb_trx's cache.
+
+ We also add all locks that are relevant to each transaction into
+ innodb_locks' and innodb_lock_waits' caches.
+*/
+
+static void fetch_data_into_cache_low(trx_i_s_cache_t *cache, const trx_t *trx)
+{
+ i_s_locks_row_t *requested_lock_row;
+
+#ifdef UNIV_DEBUG
+ {
+ const auto state= trx->state;
+
+ if (trx->is_autocommit_non_locking())
+ {
+ ut_ad(trx->read_only);
+ ut_ad(!trx->is_recovered);
+ ut_ad(trx->mysql_thd);
+ ut_ad(state == TRX_STATE_NOT_STARTED || state == TRX_STATE_ACTIVE);
+ }
+ else
+ ut_ad(state == TRX_STATE_ACTIVE ||
+ state == TRX_STATE_PREPARED ||
+ state == TRX_STATE_PREPARED_RECOVERED ||
+ state == TRX_STATE_COMMITTED_IN_MEMORY);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (add_trx_relevant_locks_to_cache(cache, trx, &requested_lock_row))
+ {
+ if (i_s_trx_row_t *trx_row= reinterpret_cast<i_s_trx_row_t*>(
+ table_cache_create_empty_row(&cache->innodb_trx, cache)))
+ {
+ if (fill_trx_row(trx_row, trx, requested_lock_row, cache))
+ return;
+ --cache->innodb_trx.rows_used;
+ }
+ }
+
+ /* memory could not be allocated */
+ cache->is_truncated= true;
+}
+
+
+/**
+ Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the
+ table cache buffer. Cache must be locked for write.
+*/
+
+static void fetch_data_into_cache(trx_i_s_cache_t *cache)
+{
+ ut_ad(lock_mutex_own());
+ trx_i_s_cache_clear(cache);
+
+ /* Capture the state of transactions */
+ trx_sys.trx_list.for_each([cache](trx_t &trx) {
+ if (!cache->is_truncated && trx.state != TRX_STATE_NOT_STARTED &&
+ &trx != purge_sys.query->trx)
+ {
+ mutex_enter(&trx.mutex);
+ if (trx.state != TRX_STATE_NOT_STARTED)
+ fetch_data_into_cache_low(cache, &trx);
+ mutex_exit(&trx.mutex);
+ }
+ });
+ cache->is_truncated= false;
+}
+
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time.
+Called from handler/i_s.cc.
+@return 0 - fetched, 1 - not */
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+ trx_i_s_cache_t* cache) /*!< in/out: cache */
+{
+ if (!can_cache_be_updated(cache)) {
+
+ return(1);
+ }
+
+ /* We need to read trx_sys and record/table lock queues */
+
+ lock_mutex_enter();
+ fetch_data_into_cache(cache);
+ lock_mutex_exit();
+
+ /* update cache last read time */
+ cache->last_read = my_interval_timer();
+
+ return(0);
+}
+
+/*******************************************************************//**
+Returns TRUE if the data in the cache is truncated due to the memory
+limit posed by TRX_I_S_MEM_LIMIT.
+@return TRUE if truncated */
+bool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ return(cache->is_truncated);
+}
+
+/*******************************************************************//**
+Initialize INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_init(
+/*===============*/
+ trx_i_s_cache_t* cache) /*!< out: cache to init */
+{
+ /* The latching is done in the following order:
+ acquire trx_i_s_cache_t::rw_lock, X
+ acquire lock mutex
+ release lock mutex
+ release trx_i_s_cache_t::rw_lock
+ acquire trx_i_s_cache_t::rw_lock, S
+ release trx_i_s_cache_t::rw_lock */
+
+ rw_lock_create(trx_i_s_cache_lock_key, &cache->rw_lock,
+ SYNC_TRX_I_S_RWLOCK);
+
+ cache->last_read = 0;
+
+ table_cache_init(&cache->innodb_trx, sizeof(i_s_trx_row_t));
+ table_cache_init(&cache->innodb_locks, sizeof(i_s_locks_row_t));
+ table_cache_init(&cache->innodb_lock_waits,
+ sizeof(i_s_lock_waits_row_t));
+
+ cache->locks_hash.create(LOCKS_HASH_CELLS_NUM);
+
+ cache->storage = ha_storage_create(CACHE_STORAGE_INITIAL_SIZE,
+ CACHE_STORAGE_HASH_CELLS);
+
+ cache->mem_allocd = 0;
+
+ cache->is_truncated = false;
+}
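For orientation, here is a minimal sketch (a hypothetical helper, not part of this file) of how a caller such as handler/i_s.cc might drive the cache, following the latching order documented in trx_i_s_cache_init() above: refresh under the exclusive latch, then read rows under the shared latch.

static void example_read_innodb_trx(trx_i_s_cache_t* cache)
{
	/* writer phase: update the cache if it has been idle long enough */
	trx_i_s_cache_start_write(cache);
	trx_i_s_possibly_fetch_data_into_cache(cache);
	trx_i_s_cache_end_write(cache);

	/* reader phase: iterate over the snapshot that was just built */
	trx_i_s_cache_start_read(cache);
	for (ulint i = 0;
	     i < trx_i_s_cache_get_rows_used(cache, I_S_INNODB_TRX); i++) {
		const i_s_trx_row_t* row = static_cast<const i_s_trx_row_t*>(
			trx_i_s_cache_get_nth_row(cache, I_S_INNODB_TRX, i));
		(void) row;	/* convert into an INFORMATION_SCHEMA row */
	}
	trx_i_s_cache_end_read(cache);
}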
+
+/*******************************************************************//**
+Free the INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_free(
+/*===============*/
+ trx_i_s_cache_t* cache) /*!< in, own: cache to free */
+{
+ rw_lock_free(&cache->rw_lock);
+
+ cache->locks_hash.free();
+ ha_storage_free(cache->storage);
+ table_cache_free(&cache->innodb_trx);
+ table_cache_free(&cache->innodb_locks);
+ table_cache_free(&cache->innodb_lock_waits);
+}
+
+/*******************************************************************//**
+Issue a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ rw_lock_s_lock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Release a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_end_read(
+/*===================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ cache->last_read = my_interval_timer();
+ rw_lock_s_unlock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Issue an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_start_write(
+/*======================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ rw_lock_x_lock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Release an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_end_write(
+/*====================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ ut_ad(rw_lock_own(&cache->rw_lock, RW_LOCK_X));
+
+ rw_lock_x_unlock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Selects a INFORMATION SCHEMA table cache from the whole cache.
+@return table cache */
+static
+i_s_table_cache_t*
+cache_select_table(
+/*===============*/
+ trx_i_s_cache_t* cache, /*!< in: whole cache */
+ enum i_s_table table) /*!< in: which table */
+{
+ ut_ad(rw_lock_own_flagged(&cache->rw_lock,
+ RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+
+ switch (table) {
+ case I_S_INNODB_TRX:
+ return &cache->innodb_trx;
+ case I_S_INNODB_LOCKS:
+ return &cache->innodb_locks;
+ case I_S_INNODB_LOCK_WAITS:
+ return &cache->innodb_lock_waits;
+ }
+
+ ut_error;
+ return NULL;
+}
+
+/*******************************************************************//**
+Retrieves the number of used rows in the cache for a given
+INFORMATION SCHEMA table.
+@return number of rows */
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table) /*!< in: which table */
+{
+ i_s_table_cache_t* table_cache;
+
+ table_cache = cache_select_table(cache, table);
+
+ return(table_cache->rows_used);
+}
+
+/*******************************************************************//**
+Retrieves the nth row (zero-based) in the cache for a given
+INFORMATION SCHEMA table.
+@return row */
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table, /*!< in: which table */
+ ulint n) /*!< in: row number */
+{
+ i_s_table_cache_t* table_cache;
+ ulint i;
+ void* row;
+
+ table_cache = cache_select_table(cache, table);
+
+ ut_a(n < table_cache->rows_used);
+
+ row = NULL;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].offset
+ + table_cache->chunks[i].rows_allocd > n) {
+
+ row = (char*) table_cache->chunks[i].base
+ + (n - table_cache->chunks[i].offset)
+ * table_cache->row_size;
+ break;
+ }
+ }
+
+ ut_a(row != NULL);
+
+ return(row);
+}
+
+/*******************************************************************//**
+Crafts a lock id string from a i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you
+want to be 100% sure that it will not abort.
+@return resulting lock id */
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ char* lock_id,/*!< out: resulting lock_id */
+ ulint lock_id_size)/*!< in: size of the lock id
+ buffer */
+{
+ int res_len;
+
+ /* please adjust TRX_I_S_LOCK_ID_MAX_LEN if you change this */
+
+ if (row->lock_index) {
+ /* record lock */
+ res_len = snprintf(lock_id, lock_id_size,
+ TRX_ID_FMT
+ ":%u:%u:%u",
+ row->lock_trx_id, row->lock_page.space(),
+ row->lock_page.page_no(), row->lock_rec);
+ } else {
+ /* table lock */
+ res_len = snprintf(lock_id, lock_id_size,
+ TRX_ID_FMT":" UINT64PF,
+ row->lock_trx_id,
+ row->lock_table_id);
+ }
+
+ /* the typecast is safe because snprintf(3) never returns
+ negative result */
+ ut_a(res_len >= 0);
+ ut_a((ulint) res_len < lock_id_size);
+
+ return(lock_id);
+}
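From the two snprintf() formats above, a record-lock id has the shape "trx_id:space:page_no:heap_no" and a table-lock id the shape "trx_id:table_id". A minimal (hypothetical) call would be:

	char lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
	trx_i_s_create_lock_id(row, lock_id, sizeof lock_id);
	/* e.g. "1563:5:3:2" for a record lock, "1563:742" for a table lock */

where row points at an i_s_locks_row_t obtained from the innodb_locks cache.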
diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc
new file mode 100644
index 00000000..28491853
--- /dev/null
+++ b/storage/innobase/trx/trx0purge.cc
@@ -0,0 +1,1297 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0purge.cc
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0purge.h"
+#include "fsp0fsp.h"
+#include "fut0fut.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "os0thread.h"
+#include "que0que.h"
+#include "row0purge.h"
+#include "row0upd.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "sync0sync.h"
+#include "trx0rec.h"
+#include "trx0roll.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include <mysql/service_wsrep.h>
+
+#include <unordered_map>
+
+/** Maximum allowable purge history length. <=0 means 'infinite'. */
+ulong srv_max_purge_lag = 0;
+
+/** Max DML user threads delay in micro-seconds. */
+ulong srv_max_purge_lag_delay = 0;
+
+/** The global data structure coordinating a purge */
+purge_sys_t purge_sys;
+
+/** A dummy undo record used as a return value when we have a whole undo log
+which needs no purge */
+trx_undo_rec_t trx_purge_dummy_rec;
+
+#ifdef UNIV_DEBUG
+my_bool srv_purge_view_update_only_debug;
+#endif /* UNIV_DEBUG */
+
+/** Sentinel value */
+static const TrxUndoRsegs NullElement;
+
+/** Default constructor */
+TrxUndoRsegsIterator::TrxUndoRsegsIterator()
+ : m_rsegs(NullElement), m_iter(m_rsegs.begin())
+{
+}
+
+/** Sets the next rseg to purge in purge_sys.
+Executed in the purge coordinator thread.
+@return whether anything is to be purged */
+inline bool TrxUndoRsegsIterator::set_next()
+{
+ mutex_enter(&purge_sys.pq_mutex);
+
+ /* Only purge consumes events from the priority queue, user
+ threads only produce the events. */
+
+ /* Check if there are more rsegs to process in the
+ current element. */
+ if (m_iter != m_rsegs.end()) {
+		/* We are still processing a rollback segment from
+		the same transaction, so the expected transaction
+		number should not increase. Undo the increment of
+		the expected commit number done by the caller, which
+		assumed that the rollback segments from the given
+		transaction were done. */
+ purge_sys.tail.trx_no = (*m_iter)->last_trx_no();
+ } else if (!purge_sys.purge_queue.empty()) {
+ m_rsegs = purge_sys.purge_queue.top();
+ purge_sys.purge_queue.pop();
+ ut_ad(purge_sys.purge_queue.empty()
+ || purge_sys.purge_queue.top() != m_rsegs);
+ m_iter = m_rsegs.begin();
+ } else {
+ /* Queue is empty, reset iterator. */
+ purge_sys.rseg = NULL;
+ mutex_exit(&purge_sys.pq_mutex);
+ m_rsegs = NullElement;
+ m_iter = m_rsegs.begin();
+ return false;
+ }
+
+ purge_sys.rseg = *m_iter++;
+ mutex_exit(&purge_sys.pq_mutex);
+ mutex_enter(&purge_sys.rseg->mutex);
+
+ ut_a(purge_sys.rseg->last_page_no != FIL_NULL);
+ ut_ad(purge_sys.rseg->last_trx_no() == m_rsegs.trx_no);
+
+ /* We assume in purge of externally stored fields that space id is
+ in the range of UNDO tablespace space ids */
+ ut_ad(purge_sys.rseg->space->id == TRX_SYS_SPACE
+ || srv_is_undo_tablespace(purge_sys.rseg->space->id));
+
+ ut_a(purge_sys.tail.trx_no <= purge_sys.rseg->last_trx_no());
+
+ purge_sys.tail.trx_no = purge_sys.rseg->last_trx_no();
+ purge_sys.hdr_offset = purge_sys.rseg->last_offset();
+ purge_sys.hdr_page_no = purge_sys.rseg->last_page_no;
+
+ mutex_exit(&purge_sys.rseg->mutex);
+
+ return(true);
+}
+
+/** Build a purge 'query' graph. The actual purge is performed by executing
+this query graph.
+@return own: the query graph */
+static
+que_t*
+purge_graph_build()
+{
+ ut_a(srv_n_purge_threads > 0);
+
+ trx_t* trx = trx_create();
+ ut_ad(!trx->id);
+ trx->start_time = time(NULL);
+ trx->start_time_micro = microsecond_interval_timer();
+ trx->state = TRX_STATE_ACTIVE;
+ trx->op_info = "purge trx";
+
+ mem_heap_t* heap = mem_heap_create(512);
+ que_fork_t* fork = que_fork_create(
+ NULL, NULL, QUE_FORK_PURGE, heap);
+ fork->trx = trx;
+
+ for (auto i = innodb_purge_threads_MAX; i; i--) {
+ que_thr_t* thr = que_thr_create(fork, heap, NULL);
+ thr->child = new(mem_heap_alloc(heap, sizeof(purge_node_t)))
+ purge_node_t(thr);
+ }
+
+ return(fork);
+}
+
+/** Initialise the purge system. */
+void purge_sys_t::create()
+{
+ ut_ad(this == &purge_sys);
+ ut_ad(!heap);
+ ut_ad(!enabled());
+ m_paused= 0;
+ query= purge_graph_build();
+ next_stored= false;
+ rseg= NULL;
+ page_no= 0;
+ offset= 0;
+ hdr_page_no= 0;
+ hdr_offset= 0;
+ rw_lock_create(trx_purge_latch_key, &latch, SYNC_PURGE_LATCH);
+ mutex_create(LATCH_ID_PURGE_SYS_PQ, &pq_mutex);
+ truncate.current= NULL;
+ truncate.last= NULL;
+ heap= mem_heap_create(4096);
+}
+
+/** Close the purge subsystem on shutdown. */
+void purge_sys_t::close()
+{
+ ut_ad(this == &purge_sys);
+ if (!heap)
+ return;
+
+ ut_ad(!enabled());
+ trx_t* trx = query->trx;
+ que_graph_free(query);
+ ut_ad(!trx->id);
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ trx->state= TRX_STATE_NOT_STARTED;
+ trx->free();
+ rw_lock_free(&latch);
+ mutex_free(&pq_mutex);
+ mem_heap_free(heap);
+ heap= nullptr;
+}
+
+/*================ UNDO LOG HISTORY LIST =============================*/
+
+/** Prepend the history list with an undo log.
+Remove the undo log segment from the rseg slot if it is too big for reuse.
+@param[in] trx transaction
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction */
+void
+trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
+{
+ DBUG_PRINT("trx", ("commit(" TRX_ID_FMT "," TRX_ID_FMT ")",
+ trx->id, trx_id_t{trx->rw_trx_hash_element->no}));
+ ut_ad(undo == trx->rsegs.m_redo.undo);
+ trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
+ ut_ad(undo->rseg == rseg);
+ buf_block_t* rseg_header = trx_rsegf_get(
+ rseg->space, rseg->page_no, mtr);
+ buf_block_t* undo_page = trx_undo_set_state_at_finish(
+ undo, mtr);
+ trx_ulogf_t* undo_header = undo_page->frame + undo->hdr_offset;
+
+ ut_ad(mach_read_from_2(undo_header + TRX_UNDO_NEEDS_PURGE) <= 1);
+
+ if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+ + rseg_header->frame))) {
+ /* This database must have been upgraded from
+ before MariaDB 10.3.5. */
+ trx_rseg_format_upgrade(rseg_header, mtr);
+ }
+
+ if (undo->state != TRX_UNDO_CACHED) {
+ /* The undo log segment will not be reused */
+ ut_a(undo->id < TRX_RSEG_N_SLOTS);
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ mtr->memset(rseg_header,
+ TRX_RSEG + TRX_RSEG_UNDO_SLOTS
+ + undo->id * TRX_RSEG_SLOT_SIZE, 4, 0xff);
+
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
+
+ uint32_t hist_size = mach_read_from_4(TRX_RSEG_HISTORY_SIZE
+ + TRX_RSEG
+ + rseg_header->frame);
+
+ ut_ad(undo->size == flst_get_len(TRX_UNDO_SEG_HDR
+ + TRX_UNDO_PAGE_LIST
+ + undo_page->frame));
+
+ mtr->write<4>(*rseg_header, TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+ + rseg_header->frame,
+ hist_size + undo->size);
+ mtr->write<8>(*rseg_header, TRX_RSEG + TRX_RSEG_MAX_TRX_ID
+ + rseg_header->frame,
+ trx_sys.get_max_trx_id());
+ }
+
+ /* After the purge thread has been given permission to exit,
+ we may roll back transactions (trx->undo_no==0)
+ in THD::cleanup() invoked from unlink_thd() in fast shutdown,
+ or in trx_rollback_recovered() in slow shutdown.
+
+ Before any transaction-generating background threads or the
+ purge have been started, we can
+ start transactions in row_merge_drop_temp_indexes() and
+ fts_drop_orphaned_tables(), and roll back recovered transactions.
+
+ Arbitrary user transactions may be executed when all the undo log
+ related background processes (including purge) are disabled due to
+ innodb_force_recovery=2 or innodb_force_recovery=3.
+ DROP TABLE may be executed at any innodb_force_recovery level.
+
+ During fast shutdown, we may also continue to execute
+ user transactions. */
+ ut_ad(srv_undo_sources
+ || trx->undo_no == 0
+ || (!purge_sys.enabled()
+ && (srv_is_being_started
+ || trx_rollback_is_active
+ || srv_force_recovery >= SRV_FORCE_NO_BACKGROUND))
+ || ((trx->mysql_thd || trx->internal)
+ && srv_fast_shutdown));
+
+#ifdef WITH_WSREP
+ if (wsrep_is_wsrep_xid(trx->xid)) {
+ trx_rseg_update_wsrep_checkpoint(rseg_header, trx->xid, mtr);
+ }
+#endif
+
+ if (trx->mysql_log_file_name && *trx->mysql_log_file_name) {
+ /* Update the latest MySQL binlog name and offset info
+ in rollback segment header if MySQL binlogging is on
+		or the database server is a MySQL replication slave. */
+ trx_rseg_update_binlog_offset(rseg_header, trx, mtr);
+ }
+
+ /* Add the log as the first in the history list */
+ flst_add_first(rseg_header, TRX_RSEG + TRX_RSEG_HISTORY, undo_page,
+ static_cast<uint16_t>(undo->hdr_offset
+ + TRX_UNDO_HISTORY_NODE), mtr);
+
+ mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page,
+ undo_header + TRX_UNDO_TRX_NO,
+ trx->rw_trx_hash_element->no);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_header
+ + TRX_UNDO_NEEDS_PURGE, 1U);
+
+ if (rseg->last_page_no == FIL_NULL) {
+ rseg->last_page_no = undo->hdr_page_no;
+ rseg->set_last_commit(undo->hdr_offset,
+ trx->rw_trx_hash_element->no);
+ rseg->needs_purge = true;
+ }
+
+ trx_sys.rseg_history_len++;
+
+ if (undo->state == TRX_UNDO_CACHED) {
+ UT_LIST_ADD_FIRST(rseg->undo_cached, undo);
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ } else {
+ ut_ad(undo->state == TRX_UNDO_TO_PURGE);
+ ut_free(undo);
+ }
+
+ undo = NULL;
+}
+
+/** Remove undo log header from the history list.
+@param[in,out] rseg rollback segment header page
+@param[in] log undo log segment header page
+@param[in] offset byte offset in the undo log segment header page
+@param[in,out] mtr mini-transaction */
+static void trx_purge_remove_log_hdr(buf_block_t *rseg, buf_block_t* log,
+ uint16_t offset, mtr_t *mtr)
+{
+ flst_remove(rseg, TRX_RSEG + TRX_RSEG_HISTORY,
+ log, static_cast<uint16_t>(offset + TRX_UNDO_HISTORY_NODE), mtr);
+ trx_sys.rseg_history_len--;
+}
+
+/** Free an undo log segment, and remove the header from the history list.
+@param[in,out] rseg rollback segment
+@param[in] hdr_addr file address of log_hdr */
+static
+void
+trx_purge_free_segment(trx_rseg_t* rseg, fil_addr_t hdr_addr)
+{
+ mtr_t mtr;
+
+ mtr.start();
+ mutex_enter(&rseg->mutex);
+
+ buf_block_t* rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr);
+ buf_block_t* block = trx_undo_page_get(
+ page_id_t(rseg->space->id, hdr_addr.page), &mtr);
+
+ /* Mark the last undo log totally purged, so that if the
+ system crashes, the tail of the undo log will not get accessed
+ again. The list of pages in the undo log tail gets
+ inconsistent during the freeing of the segment, and therefore
+ purge should not try to access them again. */
+ mtr.write<2,mtr_t::MAYBE_NOP>(*block, block->frame + hdr_addr.boffset
+ + TRX_UNDO_NEEDS_PURGE, 0U);
+
+ while (!fseg_free_step_not_header(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ + block->frame, &mtr)) {
+ mutex_exit(&rseg->mutex);
+
+ mtr.commit();
+ mtr.start();
+
+ mutex_enter(&rseg->mutex);
+
+ rseg_hdr = trx_rsegf_get(rseg->space, rseg->page_no, &mtr);
+
+ block = trx_undo_page_get(
+ page_id_t(rseg->space->id, hdr_addr.page), &mtr);
+ }
+
+ /* The page list may now be inconsistent, but the length field
+ stored in the list base node tells us how big it was before we
+ started the freeing. */
+
+ const uint32_t seg_size = flst_get_len(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame);
+
+ /* We may free the undo log segment header page; it must be freed
+ within the same mtr as the undo log header is removed from the
+ history list: otherwise, in case of a database crash, the segment
+ could become inaccessible garbage in the file space. */
+
+ trx_purge_remove_log_hdr(rseg_hdr, block, hdr_addr.boffset, &mtr);
+
+ do {
+
+ /* Here we assume that a file segment with just the header
+ page can be freed in a few steps, so that the buffer pool
+ is not flooded with bufferfixed pages: see the note in
+ fsp0fsp.cc. */
+
+ } while (!fseg_free_step(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ + block->frame, &mtr));
+
+ byte* hist = TRX_RSEG + TRX_RSEG_HISTORY_SIZE + rseg_hdr->frame;
+ ut_ad(mach_read_from_4(hist) >= seg_size);
+
+ mtr.write<4>(*rseg_hdr, hist, mach_read_from_4(hist) - seg_size);
+
+ ut_ad(rseg->curr_size >= seg_size);
+
+ rseg->curr_size -= seg_size;
+
+ mutex_exit(&(rseg->mutex));
+
+ mtr_commit(&mtr);
+}
+
+/** Remove unnecessary history data from a rollback segment.
+@param[in,out] rseg rollback segment
+@param[in] limit truncate anything before this */
+static
+void
+trx_purge_truncate_rseg_history(
+ trx_rseg_t& rseg,
+ const purge_sys_t::iterator& limit)
+{
+ fil_addr_t hdr_addr;
+ fil_addr_t prev_hdr_addr;
+ mtr_t mtr;
+ trx_id_t undo_trx_no;
+
+ mtr.start();
+ ut_ad(rseg.is_persistent());
+ mutex_enter(&rseg.mutex);
+
+ buf_block_t* rseg_hdr = trx_rsegf_get(rseg.space, rseg.page_no, &mtr);
+
+ hdr_addr = flst_get_last(TRX_RSEG + TRX_RSEG_HISTORY
+ + rseg_hdr->frame);
+ hdr_addr.boffset = static_cast<uint16_t>(hdr_addr.boffset
+ - TRX_UNDO_HISTORY_NODE);
+
+loop:
+ if (hdr_addr.page == FIL_NULL) {
+func_exit:
+ mutex_exit(&rseg.mutex);
+ mtr.commit();
+ return;
+ }
+
+ buf_block_t* block = trx_undo_page_get(page_id_t(rseg.space->id,
+ hdr_addr.page),
+ &mtr);
+ undo_trx_no = mach_read_from_8(block->frame + hdr_addr.boffset
+ + TRX_UNDO_TRX_NO);
+
+ if (undo_trx_no >= limit.trx_no) {
+ if (undo_trx_no == limit.trx_no) {
+ trx_undo_truncate_start(
+ &rseg, hdr_addr.page,
+ hdr_addr.boffset, limit.undo_no);
+ }
+
+ goto func_exit;
+ }
+
+ prev_hdr_addr = flst_get_prev_addr(block->frame + hdr_addr.boffset
+ + TRX_UNDO_HISTORY_NODE);
+ prev_hdr_addr.boffset = static_cast<uint16_t>(prev_hdr_addr.boffset
+ - TRX_UNDO_HISTORY_NODE);
+
+ if (mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + block->frame)
+ == TRX_UNDO_TO_PURGE
+ && !mach_read_from_2(block->frame + hdr_addr.boffset
+ + TRX_UNDO_NEXT_LOG)) {
+
+ /* We can free the whole log segment */
+
+ mutex_exit(&rseg.mutex);
+ mtr.commit();
+
+ /* calls the trx_purge_remove_log_hdr()
+ inside trx_purge_free_segment(). */
+ trx_purge_free_segment(&rseg, hdr_addr);
+ } else {
+ /* Remove the log hdr from the rseg history. */
+ trx_purge_remove_log_hdr(rseg_hdr, block, hdr_addr.boffset,
+ &mtr);
+
+ mutex_exit(&rseg.mutex);
+ mtr.commit();
+ }
+
+ mtr.start();
+ mutex_enter(&rseg.mutex);
+
+ rseg_hdr = trx_rsegf_get(rseg.space, rseg.page_no, &mtr);
+
+ hdr_addr = prev_hdr_addr;
+
+ goto loop;
+}
+
+/** Cleanse the purge queue to remove any rsegs that reside in the undo
+tablespace marked for truncation.
+@param[in] space undo tablespace being truncated */
+static void trx_purge_cleanse_purge_queue(const fil_space_t& space)
+{
+ typedef std::vector<TrxUndoRsegs> purge_elem_list_t;
+ purge_elem_list_t purge_elem_list;
+
+ mutex_enter(&purge_sys.pq_mutex);
+
+	/* Drain the purge queue before we start truncating the
+	corresponding undo tablespace. */
+ while (!purge_sys.purge_queue.empty()) {
+ purge_elem_list.push_back(purge_sys.purge_queue.top());
+ purge_sys.purge_queue.pop();
+ }
+
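+	/* Put the elements back into the queue, erasing any rseg that
+	belongs to the tablespace being truncated and dropping elements
+	that became empty. */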
+ for (purge_elem_list_t::iterator it = purge_elem_list.begin();
+ it != purge_elem_list.end();
+ ++it) {
+
+ for (TrxUndoRsegs::iterator it2 = it->begin();
+ it2 != it->end();
+ ++it2) {
+ if ((*it2)->space == &space) {
+ it->erase(it2);
+ break;
+ }
+ }
+
+ if (!it->empty()) {
+ purge_sys.purge_queue.push(*it);
+ }
+ }
+
+ mutex_exit(&purge_sys.pq_mutex);
+}
+
+/**
+Removes unnecessary history data from rollback segments. NOTE that when this
+function is called, the caller must not have any latches on undo log pages!
+*/
+static void trx_purge_truncate_history()
+{
+ ut_ad(purge_sys.head <= purge_sys.tail);
+ purge_sys_t::iterator& head = purge_sys.head.trx_no
+ ? purge_sys.head : purge_sys.tail;
+
+ if (head.trx_no >= purge_sys.low_limit_no()) {
+ /* This is sometimes necessary. TODO: find out why. */
+ head.trx_no = purge_sys.low_limit_no();
+ head.undo_no = 0;
+ }
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) {
+ ut_ad(rseg->id == i);
+ trx_purge_truncate_rseg_history(*rseg, head);
+ }
+ }
+
+ if (srv_undo_tablespaces_active < 2) {
+ return;
+ }
+
+ while (srv_undo_log_truncate) {
+ if (!purge_sys.truncate.current) {
+ const ulint threshold = ulint(srv_max_undo_log_size
+ >> srv_page_size_shift);
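+			/* Scan the undo tablespaces in round-robin
+			order, starting from the one that was truncated
+			most recently (or from the first one), and pick
+			the first tablespace whose size exceeds the
+			threshold. */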
+ for (ulint i = purge_sys.truncate.last
+ ? purge_sys.truncate.last->id
+ - srv_undo_space_id_start
+ : 0, j = i;; ) {
+ ulint space_id = srv_undo_space_id_start + i;
+ ut_ad(srv_is_undo_tablespace(space_id));
+ fil_space_t* space= fil_space_get(space_id);
+
+ if (space && space->get_size() > threshold) {
+ purge_sys.truncate.current = space;
+ break;
+ }
+
+ ++i;
+ i %= srv_undo_tablespaces_active;
+ if (i == j) {
+ break;
+ }
+ }
+ }
+
+ if (!purge_sys.truncate.current) {
+ return;
+ }
+
+ fil_space_t& space = *purge_sys.truncate.current;
+		/* An undo tablespace always consists of a single file. */
+ ut_a(UT_LIST_GET_LEN(space.chain) == 1);
+ fil_node_t* file = UT_LIST_GET_FIRST(space.chain);
+ /* The undo tablespace files are never closed. */
+ ut_ad(file->is_open());
+
+ DBUG_LOG("undo", "marking for truncate: " << file->name);
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) {
+ ut_ad(rseg->is_persistent());
+ if (rseg->space == &space) {
+ /* Once set, this rseg will
+ not be allocated to subsequent
+ transactions, but we will wait
+ for existing active
+ transactions to finish. */
+ rseg->skip_allocation = true;
+ }
+ }
+ }
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ trx_rseg_t* rseg = trx_sys.rseg_array[i];
+ if (!rseg || rseg->space != &space) {
+ continue;
+ }
+ mutex_enter(&rseg->mutex);
+ ut_ad(rseg->skip_allocation);
+ if (rseg->trx_ref_count) {
+not_free:
+ mutex_exit(&rseg->mutex);
+ return;
+ }
+
+ if (rseg->curr_size != 1) {
+ /* Check if all segments are
+ cached and safe to remove. */
+ ulint cached = 0;
+
+ for (trx_undo_t* undo = UT_LIST_GET_FIRST(
+ rseg->undo_cached);
+ undo;
+ undo = UT_LIST_GET_NEXT(undo_list,
+ undo)) {
+ if (head.trx_no < undo->trx_id) {
+ goto not_free;
+ } else {
+ cached += undo->size;
+ }
+ }
+
+ ut_ad(rseg->curr_size > cached);
+
+ if (rseg->curr_size > cached + 1) {
+ goto not_free;
+ }
+ }
+
+ mutex_exit(&rseg->mutex);
+ }
+
+ ib::info() << "Truncating " << file->name;
+ trx_purge_cleanse_purge_queue(space);
+
+ /* Flush all to-be-discarded pages of the tablespace.
+
+ During truncation, we do not want any writes to the
+ to-be-discarded area, because we must set the space.size
+ early in order to have deterministic page allocation.
+
+ If a log checkpoint was completed at LSN earlier than our
+ mini-transaction commit and the server was killed, then
+ discarding the to-be-trimmed pages without flushing would
+ break crash recovery. So, we cannot avoid the write. */
+ while (buf_flush_list_space(&space));
+
+ log_free_check();
+
+ /* Adjust the tablespace metadata. */
+ if (!fil_truncate_prepare(space.id)) {
+ ib::error() << "Failed to find UNDO tablespace "
+ << file->name;
+ return;
+ }
+
+ /* Re-initialize tablespace, in a single mini-transaction. */
+ mtr_t mtr;
+ const ulint size = SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
+ mtr.start();
+ mtr_x_lock_space(purge_sys.truncate.current, &mtr);
+ /* Associate the undo tablespace with mtr.
+ During mtr::commit(), InnoDB can use the undo
+ tablespace object to clear all freed ranges */
+ mtr.set_named_space(purge_sys.truncate.current);
+ mtr.trim_pages(page_id_t(space.id, size));
+ fsp_header_init(purge_sys.truncate.current, size, &mtr);
+ mutex_enter(&fil_system.mutex);
+ purge_sys.truncate.current->size = file->size = size;
+ mutex_exit(&fil_system.mutex);
+
+ buf_block_t* sys_header = trx_sysf_get(&mtr);
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ trx_rseg_t* rseg = trx_sys.rseg_array[i];
+ if (!rseg || rseg->space != &space) {
+ continue;
+ }
+
+ ut_ad(rseg->is_persistent());
+ ut_d(const ulint old_page = rseg->page_no);
+
+ buf_block_t* rblock = trx_rseg_header_create(
+ purge_sys.truncate.current,
+ rseg->id, sys_header, &mtr);
+ ut_ad(rblock);
+ rseg->page_no = rblock
+ ? rblock->page.id().page_no() : FIL_NULL;
+ ut_ad(old_page == rseg->page_no);
+
+ /* Before re-initialization ensure that we
+ free the existing structure. There can't be
+ any active transactions. */
+ ut_a(UT_LIST_GET_LEN(rseg->undo_list) == 0);
+
+ trx_undo_t* next_undo;
+
+ for (trx_undo_t* undo = UT_LIST_GET_FIRST(
+ rseg->undo_cached);
+ undo; undo = next_undo) {
+
+ next_undo = UT_LIST_GET_NEXT(undo_list, undo);
+ UT_LIST_REMOVE(rseg->undo_cached, undo);
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ ut_free(undo);
+ }
+
+ UT_LIST_INIT(rseg->undo_list,
+ &trx_undo_t::undo_list);
+ UT_LIST_INIT(rseg->undo_cached,
+ &trx_undo_t::undo_list);
+
+ /* These were written by trx_rseg_header_create(). */
+ ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+ + rblock->frame));
+ ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+ + rblock->frame));
+
+ /* Initialize the undo log lists according to
+ the rseg header */
+ rseg->curr_size = 1;
+ rseg->trx_ref_count = 0;
+ rseg->last_page_no = FIL_NULL;
+ rseg->last_commit_and_offset = 0;
+ rseg->needs_purge = false;
+ }
+
+ mtr.commit();
+ /* Write-ahead the redo log record. */
+ log_write_up_to(mtr.commit_lsn(), true);
+
+ /* Trim the file size. */
+ os_file_truncate(file->name, file->handle,
+ os_offset_t(size) << srv_page_size_shift,
+ true);
+
+ /* This is only executed by srv_purge_coordinator_thread. */
+ export_vars.innodb_undo_truncations++;
+
+ /* In MDEV-8319 (10.5) we will PUNCH_HOLE the garbage
+ (with write-ahead logging). */
+ mutex_enter(&fil_system.mutex);
+ ut_ad(&space == purge_sys.truncate.current);
+ ut_ad(space.is_being_truncated);
+ purge_sys.truncate.current->set_stopping(false);
+ purge_sys.truncate.current->is_being_truncated = false;
+ mutex_exit(&fil_system.mutex);
+
+ if (purge_sys.rseg != NULL
+ && purge_sys.rseg->last_page_no == FIL_NULL) {
+			/* If purge_sys.rseg points to a rseg that was
+			just truncated, move on to the next rseg.
+			Note: ideally purge_sys.rseg should be NULL,
+			because purge should have processed all the
+			records, but srv_purge_batch_size can force the
+			purge loop to exit before all records have been
+			purged; in that case purge_sys.rseg may still
+			point to a valid rseg waiting for the next
+			purge cycle. */
+ purge_sys.next_stored = false;
+ purge_sys.rseg = NULL;
+ }
+
+ DBUG_EXECUTE_IF("ib_undo_trunc",
+ ib::info() << "ib_undo_trunc";
+ log_buffer_flush_to_disk();
+ DBUG_SUICIDE(););
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ if (trx_rseg_t* rseg = trx_sys.rseg_array[i]) {
+ ut_ad(rseg->is_persistent());
+ if (rseg->space == &space) {
+ rseg->skip_allocation = false;
+ }
+ }
+ }
+
+ ib::info() << "Truncated " << file->name;
+ purge_sys.truncate.last = purge_sys.truncate.current;
+ purge_sys.truncate.current = NULL;
+ }
+}
+
+/***********************************************************************//**
+Updates the last not yet purged history log info in rseg when we have purged
+a whole undo log. Also advances purge_sys.tail past the purged log. */
+static void trx_purge_rseg_get_next_history_log(
+ ulint* n_pages_handled)/*!< in/out: number of UNDO pages
+ handled */
+{
+ fil_addr_t prev_log_addr;
+ trx_id_t trx_no;
+ mtr_t mtr;
+
+ mutex_enter(&purge_sys.rseg->mutex);
+
+ ut_a(purge_sys.rseg->last_page_no != FIL_NULL);
+
+ purge_sys.tail.trx_no = purge_sys.rseg->last_trx_no() + 1;
+ purge_sys.tail.undo_no = 0;
+ purge_sys.next_stored = false;
+
+ mtr.start();
+
+ const buf_block_t* undo_page = trx_undo_page_get_s_latched(
+ page_id_t(purge_sys.rseg->space->id,
+ purge_sys.rseg->last_page_no), &mtr);
+
+ const trx_ulogf_t* log_hdr = undo_page->frame
+ + purge_sys.rseg->last_offset();
+
+ /* Increase the purge page count by one for every handled log */
+
+ (*n_pages_handled)++;
+
+ prev_log_addr = flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE);
+ prev_log_addr.boffset = static_cast<uint16_t>(prev_log_addr.boffset
+ - TRX_UNDO_HISTORY_NODE);
+
+
+ const bool empty = prev_log_addr.page == FIL_NULL;
+
+ if (empty) {
+ /* No logs left in the history list */
+ purge_sys.rseg->last_page_no = FIL_NULL;
+ }
+
+ mutex_exit(&purge_sys.rseg->mutex);
+ mtr.commit();
+
+ if (empty) {
+ return;
+ }
+
+ /* Read the previous log header. */
+ mtr.start();
+
+ log_hdr = trx_undo_page_get_s_latched(
+ page_id_t(purge_sys.rseg->space->id, prev_log_addr.page),
+ &mtr)->frame
+ + prev_log_addr.boffset;
+
+ trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
+ ut_ad(mach_read_from_2(log_hdr + TRX_UNDO_NEEDS_PURGE) <= 1);
+
+ mtr_commit(&mtr);
+
+ mutex_enter(&purge_sys.rseg->mutex);
+
+ purge_sys.rseg->last_page_no = prev_log_addr.page;
+ purge_sys.rseg->set_last_commit(prev_log_addr.boffset, trx_no);
+ purge_sys.rseg->needs_purge = log_hdr[TRX_UNDO_NEEDS_PURGE + 1] != 0;
+
+	/* Purge can also produce events, but these are already ordered
+	in the rollback segment, and any user-generated event will be
+	greater than the events that purge produces; i.e. purge can never
+	produce events from an empty rollback segment. */
+
+ mutex_enter(&purge_sys.pq_mutex);
+
+ purge_sys.purge_queue.push(*purge_sys.rseg);
+
+ mutex_exit(&purge_sys.pq_mutex);
+
+ mutex_exit(&purge_sys.rseg->mutex);
+}
+
+/** Position the purge sys "iterator" on the undo record to use for purging. */
+static void trx_purge_read_undo_rec()
+{
+ uint16_t offset;
+ uint32_t page_no;
+ ib_uint64_t undo_no;
+
+ purge_sys.hdr_offset = purge_sys.rseg->last_offset();
+ page_no = purge_sys.hdr_page_no = purge_sys.rseg->last_page_no;
+
+ if (purge_sys.rseg->needs_purge) {
+ mtr_t mtr;
+ mtr.start();
+ buf_block_t* undo_page;
+ if (trx_undo_rec_t* undo_rec = trx_undo_get_first_rec(
+ *purge_sys.rseg->space, purge_sys.hdr_page_no,
+ purge_sys.hdr_offset, RW_S_LATCH,
+ undo_page, &mtr)) {
+
+ offset = page_offset(undo_rec);
+ undo_no = trx_undo_rec_get_undo_no(undo_rec);
+ page_no = undo_page->page.id().page_no();
+ } else {
+ offset = 0;
+ undo_no = 0;
+ }
+
+ mtr.commit();
+ } else {
+ offset = 0;
+ undo_no = 0;
+ }
+
+ purge_sys.offset = offset;
+ purge_sys.page_no = page_no;
+ purge_sys.tail.undo_no = undo_no;
+
+ purge_sys.next_stored = true;
+}
+
+/***********************************************************************//**
+Chooses the next undo log to purge and updates the info in purge_sys. This
+function is used to initialize purge_sys when the next record to purge is
+not known, and also to update the purge system info on the next record when
+purge has handled the whole undo log for a transaction. */
+static
+void
+trx_purge_choose_next_log(void)
+/*===========================*/
+{
+ ut_ad(!purge_sys.next_stored);
+
+ if (purge_sys.rseg_iter.set_next()) {
+ trx_purge_read_undo_rec();
+ } else {
+ /* There is nothing to do yet. */
+ os_thread_yield();
+ }
+}
+
+/***********************************************************************//**
+Gets the next record to purge and updates the info in the purge system.
+@return copy of an undo log record or pointer to the dummy undo log record */
+static
+trx_undo_rec_t*
+trx_purge_get_next_rec(
+/*===================*/
+ ulint* n_pages_handled,/*!< in/out: number of UNDO pages
+ handled */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ mtr_t mtr;
+
+ ut_ad(purge_sys.next_stored);
+ ut_ad(purge_sys.tail.trx_no < purge_sys.low_limit_no());
+
+ const ulint space = purge_sys.rseg->space->id;
+ const uint32_t page_no = purge_sys.page_no;
+ const uint16_t offset = purge_sys.offset;
+
+ if (offset == 0) {
+ /* It is the dummy undo log record, which means that there is
+ no need to purge this undo log */
+
+ trx_purge_rseg_get_next_history_log(n_pages_handled);
+
+ /* Look for the next undo log and record to purge */
+
+ trx_purge_choose_next_log();
+
+ return(&trx_purge_dummy_rec);
+ }
+
+ mtr_start(&mtr);
+
+ buf_block_t* undo_page = trx_undo_page_get_s_latched(
+ page_id_t(space, page_no), &mtr);
+ buf_block_t* rec2_page = undo_page;
+
+ const trx_undo_rec_t* rec2 = trx_undo_page_get_next_rec(
+ undo_page, offset, purge_sys.hdr_page_no, purge_sys.hdr_offset);
+
+ if (rec2 == NULL) {
+ rec2 = trx_undo_get_next_rec(rec2_page, offset,
+ purge_sys.hdr_page_no,
+ purge_sys.hdr_offset, &mtr);
+ }
+
+ if (rec2 == NULL) {
+ mtr_commit(&mtr);
+
+ trx_purge_rseg_get_next_history_log(n_pages_handled);
+
+ /* Look for the next undo log and record to purge */
+
+ trx_purge_choose_next_log();
+
+ mtr_start(&mtr);
+
+ undo_page = trx_undo_page_get_s_latched(
+ page_id_t(space, page_no), &mtr);
+ } else {
+ purge_sys.offset = page_offset(rec2);
+ purge_sys.page_no = rec2_page->page.id().page_no();
+ purge_sys.tail.undo_no = trx_undo_rec_get_undo_no(rec2);
+
+ if (undo_page != rec2_page) {
+ /* We advance to a new page of the undo log: */
+ (*n_pages_handled)++;
+ }
+ }
+
+ trx_undo_rec_t* rec_copy = trx_undo_rec_copy(undo_page->frame + offset,
+ heap);
+
+ mtr_commit(&mtr);
+
+ return(rec_copy);
+}
+
+/********************************************************************//**
+Fetches the next undo log record from the history list to purge.
+@return copy of an undo log record or pointer to trx_purge_dummy_rec,
+if the whole undo log can be skipped in purge; NULL if none left */
+static MY_ATTRIBUTE((warn_unused_result))
+trx_undo_rec_t*
+trx_purge_fetch_next_rec(
+/*=====================*/
+ roll_ptr_t* roll_ptr, /*!< out: roll pointer to undo record */
+ ulint* n_pages_handled,/*!< in/out: number of UNDO log pages
+ handled */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ if (!purge_sys.next_stored) {
+ trx_purge_choose_next_log();
+
+ if (!purge_sys.next_stored) {
+ DBUG_PRINT("ib_purge",
+ ("no logs left in the history list"));
+ return(NULL);
+ }
+ }
+
+ if (purge_sys.tail.trx_no >= purge_sys.low_limit_no()) {
+
+ return(NULL);
+ }
+
+ /* fprintf(stderr, "Thread %lu purging trx %llu undo record %llu\n",
+ os_thread_get_curr_id(), iter->trx_no, iter->undo_no); */
+
+ *roll_ptr = trx_undo_build_roll_ptr(
+ /* row_purge_record_func() will later set
+ ROLL_PTR_INSERT_FLAG for TRX_UNDO_INSERT_REC */
+ false,
+ purge_sys.rseg->id,
+ purge_sys.page_no, purge_sys.offset);
+
+ /* The following call will advance the stored values of the
+ purge iterator. */
+
+ return(trx_purge_get_next_rec(n_pages_handled, heap));
+}
+
+/** Fetch the undo log records for a purge batch and attach them to the
+purge query threads.
+@param n_purge_threads number of purge threads
+@return number of undo log pages handled in the batch */
+static
+ulint
+trx_purge_attach_undo_recs(ulint n_purge_threads)
+{
+ que_thr_t* thr;
+ ulint i;
+ ulint n_pages_handled = 0;
+ ulint n_thrs = UT_LIST_GET_LEN(purge_sys.query->thrs);
+
+ ut_a(n_purge_threads > 0);
+
+ purge_sys.head = purge_sys.tail;
+
+#ifdef UNIV_DEBUG
+ i = 0;
+ /* Debug code to validate some pre-requisites and reset done flag. */
+ for (thr = UT_LIST_GET_FIRST(purge_sys.query->thrs);
+ thr != NULL && i < n_purge_threads;
+ thr = UT_LIST_GET_NEXT(thrs, thr), ++i) {
+
+ purge_node_t* node;
+
+ /* Get the purge node. */
+ node = (purge_node_t*) thr->child;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
+ ut_ad(node->undo_recs.empty());
+ ut_ad(!node->in_progress);
+ ut_d(node->in_progress = true);
+ }
+
+	/* There should never be fewer nodes than threads; the inverse,
+	however, is allowed because we only use purge threads as needed. */
+ ut_ad(i == n_purge_threads);
+#endif
+
+ /* Fetch and parse the UNDO records. The UNDO records are added
+ to a per purge node vector. */
+ thr = UT_LIST_GET_FIRST(purge_sys.query->thrs);
+ ut_a(n_thrs > 0 && thr != NULL);
+
+ ut_ad(purge_sys.head <= purge_sys.tail);
+
+ i = 0;
+
+ const ulint batch_size = srv_purge_batch_size;
+ std::unordered_map<table_id_t, purge_node_t*> table_id_map;
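+	/* Route all undo log records of a given table to the same purge
+	node, so that a single purge thread handles that table within
+	this batch. */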
+ mem_heap_empty(purge_sys.heap);
+
+ while (UNIV_LIKELY(srv_undo_sources) || !srv_fast_shutdown) {
+ purge_node_t* node;
+ trx_purge_rec_t purge_rec;
+
+ ut_a(!thr->is_active);
+
+ /* Get the purge node. */
+ node = (purge_node_t*) thr->child;
+ ut_a(que_node_get_type(node) == QUE_NODE_PURGE);
+
+ /* Track the max {trx_id, undo_no} for truncating the
+ UNDO logs once we have purged the records. */
+
+ if (purge_sys.head <= purge_sys.tail) {
+ purge_sys.head = purge_sys.tail;
+ }
+
+ /* Fetch the next record, and advance the purge_sys.tail. */
+ purge_rec.undo_rec = trx_purge_fetch_next_rec(
+ &purge_rec.roll_ptr, &n_pages_handled,
+ purge_sys.heap);
+
+ if (purge_rec.undo_rec == NULL) {
+ break;
+ } else if (purge_rec.undo_rec == &trx_purge_dummy_rec) {
+ continue;
+ }
+
+ table_id_t table_id = trx_undo_rec_get_table_id(
+ purge_rec.undo_rec);
+
+ purge_node_t *& table_node = table_id_map[table_id];
+
+ if (table_node) {
+ node = table_node;
+ } else {
+ thr = UT_LIST_GET_NEXT(thrs, thr);
+
+ if (!(++i % n_purge_threads)) {
+ thr = UT_LIST_GET_FIRST(
+ purge_sys.query->thrs);
+ }
+
+ ut_a(thr != NULL);
+ table_node = node;
+ }
+
+ node->undo_recs.push(purge_rec);
+
+ if (n_pages_handled >= batch_size) {
+ break;
+ }
+ }
+
+ ut_ad(purge_sys.head <= purge_sys.tail);
+
+ return(n_pages_handled);
+}
+
+/*******************************************************************//**
+Calculate the DML delay required.
+@return delay in microseconds or ULINT_MAX */
+static
+ulint
+trx_purge_dml_delay(void)
+/*=====================*/
+{
+	/* Determine how long data manipulation language (DML) statements
+	need to be delayed in order to reduce the lag of the purge
+	thread. */
+ ulint delay = 0; /* in microseconds; default: no delay */
+
+ /* If purge lag is set then calculate the new DML delay. */
+
+ if (srv_max_purge_lag > 0) {
+ double ratio = static_cast<double>(trx_sys.rseg_history_len) /
+ static_cast<double>(srv_max_purge_lag);
+
+ if (ratio > 1.0) {
+ /* If the history list length exceeds the
+ srv_max_purge_lag, the data manipulation
+ statements are delayed by at least 5000
+ microseconds. */
+ delay = (ulint) ((ratio - .5) * 10000);
+ }
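+		/* For example, if the history list is twice as long as
+		srv_max_purge_lag, then ratio = 2.0 and
+		delay = (2.0 - 0.5) * 10000 = 15000 microseconds,
+		subject to the srv_max_purge_lag_delay cap below. */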
+
+ if (delay > srv_max_purge_lag_delay) {
+ delay = srv_max_purge_lag_delay;
+ }
+
+ MONITOR_SET(MONITOR_DML_PURGE_DELAY, delay);
+ }
+
+ return(delay);
+}
+
+extern tpool::waitable_task purge_worker_task;
+
+/** Wait for pending purge jobs to complete. */
+static void trx_purge_wait_for_workers_to_complete()
+{
+ bool notify_wait = purge_worker_task.is_running();
+
+ if (notify_wait)
+ tpool::tpool_wait_begin();
+
+ purge_worker_task.wait();
+
+ if(notify_wait)
+ tpool::tpool_wait_end();
+
+ /* There should be no outstanding tasks as long
+ as the worker threads are active. */
+ ut_ad(srv_get_task_queue_length() == 0);
+}
+
+/**
+Run a purge batch.
+@param n_tasks number of purge tasks to submit to the queue
+@param truncate whether to truncate the history at the end of the batch
+@return number of undo log pages handled in the batch */
+ulint trx_purge(ulint n_tasks, bool truncate)
+{
+ que_thr_t* thr = NULL;
+ ulint n_pages_handled;
+
+ ut_ad(n_tasks > 0);
+
+ srv_dml_needed_delay = trx_purge_dml_delay();
+
+ purge_sys.clone_oldest_view();
+
+#ifdef UNIV_DEBUG
+ if (srv_purge_view_update_only_debug) {
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Fetch the UNDO recs that need to be purged. */
+ n_pages_handled = trx_purge_attach_undo_recs(n_tasks);
+
+ /* Submit tasks to workers queue if using multi-threaded purge. */
+ for (ulint i = n_tasks; --i; ) {
+ thr = que_fork_scheduler_round_robin(purge_sys.query, thr);
+ ut_a(thr);
+ srv_que_task_enqueue_low(thr);
+ srv_thread_pool->submit_task(&purge_worker_task);
+ }
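+	/* Only n_tasks - 1 tasks were submitted above; the coordinator
+	thread executes the remaining query thread itself below. */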
+
+ thr = que_fork_scheduler_round_robin(purge_sys.query, thr);
+
+ que_run_threads(thr);
+
+ trx_purge_wait_for_workers_to_complete();
+
+ if (truncate) {
+ trx_purge_truncate_history();
+ }
+
+ MONITOR_INC_VALUE(MONITOR_PURGE_INVOKED, 1);
+ MONITOR_INC_VALUE(MONITOR_PURGE_N_PAGE_HANDLED, n_pages_handled);
+
+ return(n_pages_handled);
+}
diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc
new file mode 100644
index 00000000..438dfcf9
--- /dev/null
+++ b/storage/innobase/trx/trx0rec.cc
@@ -0,0 +1,2559 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rec.cc
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rec.h"
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0undo.h"
+#include "mtr0log.h"
+#include "dict0dict.h"
+#include "ut0mem.h"
+#include "row0ext.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "trx0purge.h"
+#include "trx0rseg.h"
+#include "row0row.h"
+#include "row0mysql.h"
+
+/** The search tuple corresponding to TRX_UNDO_INSERT_METADATA. */
+const dtuple_t trx_undo_metadata = {
+ /* This also works for REC_INFO_METADATA_ALTER, because the
+ delete-mark (REC_INFO_DELETED_FLAG) is ignored when searching. */
+ REC_INFO_METADATA_ADD, 0, 0,
+ NULL, 0, NULL
+#ifdef UNIV_DEBUG
+ , DATA_TUPLE_MAGIC_N
+#endif /* UNIV_DEBUG */
+};
+
+/*=========== UNDO LOG RECORD CREATION AND DECODING ====================*/
+
+/** Calculate the free space left for extending an undo log record.
+@param undo_block undo log page
+@param ptr current end of the undo page
+@return bytes left */
+static ulint trx_undo_left(const buf_block_t *undo_block, const byte *ptr)
+{
+ ut_ad(ptr >= &undo_block->frame[TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE]);
+ /* The 10 is supposed to be an extra safety margin (and needed for
+ compatibility with older versions) */
+ lint left= srv_page_size - (ptr - undo_block->frame) -
+ (10 + FIL_PAGE_DATA_END);
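+  /* For example, on a 16KiB page with ptr at byte offset 1000,
+  left = 16384 - 1000 - (10 + 8) = 15366 bytes, because
+  FIL_PAGE_DATA_END is 8. */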
+ ut_ad(left >= 0);
+ return left < 0 ? 0 : static_cast<ulint>(left);
+}
+
+/**********************************************************************//**
+Set the next and previous pointers in the undo page for the undo record
+that was written to ptr. Update the first free value by the number of bytes
+written for this undo record.
+@return offset of the inserted entry on the page if it succeeded, 0 on failure */
+static
+uint16_t
+trx_undo_page_set_next_prev_and_add(
+/*================================*/
+ buf_block_t* undo_block, /*!< in/out: undo log page */
+ byte* ptr, /*!< in: ptr up to where data has been
+ written on this undo page. */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(page_align(ptr) == undo_block->frame);
+
+ if (UNIV_UNLIKELY(trx_undo_left(undo_block, ptr) < 2))
+ return 0;
+
+ byte *ptr_to_first_free= my_assume_aligned<2>(TRX_UNDO_PAGE_HDR +
+ TRX_UNDO_PAGE_FREE +
+ undo_block->frame);
+
+ const uint16_t first_free= mach_read_from_2(ptr_to_first_free);
+
+ /* Write offset of the previous undo log record */
+ memcpy(ptr, ptr_to_first_free, 2);
+ ptr += 2;
+
+ const uint16_t end_of_rec= static_cast<uint16_t>(ptr - undo_block->frame);
+
+ /* Update the offset to first free undo record */
+ mach_write_to_2(ptr_to_first_free, end_of_rec);
+ /* Write offset of the next undo log record */
+ memcpy(undo_block->frame + first_free, ptr_to_first_free, 2);
+ const byte *start= undo_block->frame + first_free + 2;
+
+ mtr->undo_append(*undo_block, start, ptr - start - 2);
+ return first_free;
+}
+
+/** Virtual column undo log version. To distinguish it from a length value
+in 5.7.8 undo log, it starts with 0xF1 */
+static const ulint VIRTUAL_COL_UNDO_FORMAT_1 = 0xF1;
+
+/** Write virtual column index info (index id and column position in index)
+to the undo log
+@param[in,out] undo_block undo log page
+@param[in] table the table
+@param[in] pos the virtual column position
+@param[in] ptr undo log record being written
+@param[in] first_v_col whether this is the first virtual column
+ which could start with a version marker
+@return new undo log pointer */
+static
+byte*
+trx_undo_log_v_idx(
+ buf_block_t* undo_block,
+ const dict_table_t* table,
+ ulint pos,
+ byte* ptr,
+ bool first_v_col)
+{
+ ut_ad(pos < table->n_v_def);
+ dict_v_col_t* vcol = dict_table_get_nth_v_col(table, pos);
+ byte* old_ptr;
+
+ ut_ad(!vcol->v_indexes.empty());
+
+ ulint size = first_v_col ? 1 + 2 : 2;
+ const ulint avail = trx_undo_left(undo_block, ptr);
+
+ /* The mach_write_compressed(ptr, flen) in
+	trx_undo_page_report_modify() will consume an additional 1 to 5 bytes. */
+ if (avail < size + 5) {
+ return(NULL);
+ }
+
+ ulint n_idx = 0;
+ for (const auto& v_index : vcol->v_indexes) {
+ n_idx++;
+ /* FIXME: index->id is 64 bits! */
+ size += mach_get_compressed_size(uint32_t(v_index.index->id));
+ size += mach_get_compressed_size(v_index.nth_field);
+ }
+
+ size += mach_get_compressed_size(n_idx);
+
+ if (avail < size + 5) {
+ return(NULL);
+ }
+
+ ut_d(const byte* orig_ptr = ptr);
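+	/* The format written below is: an optional 0xF1 version marker
+	(first virtual column only), a 2-byte total length, the compressed
+	number of indexes, and for each index a compressed index id
+	followed by the compressed field position. */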
+
+ if (first_v_col) {
+ /* write the version marker */
+ mach_write_to_1(ptr, VIRTUAL_COL_UNDO_FORMAT_1);
+
+ ptr += 1;
+ }
+
+ old_ptr = ptr;
+
+ ptr += 2;
+
+ ptr += mach_write_compressed(ptr, n_idx);
+
+ for (const auto& v_index : vcol->v_indexes) {
+ ptr += mach_write_compressed(
+ /* FIXME: index->id is 64 bits! */
+ ptr, uint32_t(v_index.index->id));
+
+ ptr += mach_write_compressed(ptr, v_index.nth_field);
+ }
+
+ ut_ad(orig_ptr + size == ptr);
+
+ mach_write_to_2(old_ptr, ulint(ptr - old_ptr));
+
+ return(ptr);
+}
+
+/** Read virtual column index from undo log, and verify the column is still
+indexed, and return its position
+@param[in] table the table
+@param[in] ptr undo log pointer
+@param[out] col_pos the column number or FIL_NULL
+ if the column is not indexed any more
+@return remaining part of undo log record after reading these values */
+static
+const byte*
+trx_undo_read_v_idx_low(
+ const dict_table_t* table,
+ const byte* ptr,
+ uint32_t* col_pos)
+{
+ ulint len = mach_read_from_2(ptr);
+ const byte* old_ptr = ptr;
+
+ *col_pos = FIL_NULL;
+
+ ptr += 2;
+
+ ulint num_idx = mach_read_next_compressed(&ptr);
+
+ ut_ad(num_idx > 0);
+
+ dict_index_t* clust_index = dict_table_get_first_index(table);
+
+ for (ulint i = 0; i < num_idx; i++) {
+ index_id_t id = mach_read_next_compressed(&ptr);
+ ulint pos = mach_read_next_compressed(&ptr);
+ dict_index_t* index = dict_table_get_next_index(clust_index);
+
+ while (index != NULL) {
+ /* Return if we find a matching index.
+			TODO: in the future, it might be worth adding
+			checks on other indexes */
+ if (index->id == id) {
+ const dict_col_t* col = dict_index_get_nth_col(
+ index, pos);
+ ut_ad(col->is_virtual());
+ const dict_v_col_t* vcol = reinterpret_cast<
+ const dict_v_col_t*>(col);
+ *col_pos = vcol->v_pos;
+ return(old_ptr + len);
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+ }
+
+ return(old_ptr + len);
+}
+
+/** Read virtual column index from undo log or online log if the log
+contains such info, and in the undo log case, verify the column is
+still indexed, and output its position
+@param[in] table the table
+@param[in] ptr undo log pointer
+@param[in] first_v_col if this is the first virtual column, which
+ has the version marker
+@param[in,out]	is_undo_log	this function parses both the undo log and
+				the online log for virtual columns; when
+				first_v_col is true, is_undo_log is an
+				output, otherwise it is an input
+@param[out] field_no the column number, or FIL_NULL if not indexed
+@return remaining part of undo log record after reading these values */
+const byte*
+trx_undo_read_v_idx(
+ const dict_table_t* table,
+ const byte* ptr,
+ bool first_v_col,
+ bool* is_undo_log,
+ uint32_t* field_no)
+{
+ /* Version marker only put on the first virtual column */
+ if (first_v_col) {
+ /* Undo log has the virtual undo log marker */
+ *is_undo_log = (mach_read_from_1(ptr)
+ == VIRTUAL_COL_UNDO_FORMAT_1);
+
+ if (*is_undo_log) {
+ ptr += 1;
+ }
+ }
+
+ if (*is_undo_log) {
+ ptr = trx_undo_read_v_idx_low(table, ptr, field_no);
+ } else {
+ *field_no -= REC_MAX_N_FIELDS;
+ }
+
+ return(ptr);
+}
+
+/** Writes to the undo log the virtual column values of an insert.
+@param[in] undo_block undo log page
+@param[in] table the table
+@param[in] row dtuple contains the virtual columns
+@param[in,out] ptr log ptr
+@return true if write goes well, false if out of space */
+static
+bool
+trx_undo_report_insert_virtual(
+ buf_block_t* undo_block,
+ dict_table_t* table,
+ const dtuple_t* row,
+ byte** ptr)
+{
+ byte* start = *ptr;
+ bool first_v_col = true;
+
+ if (trx_undo_left(undo_block, *ptr) < 2) {
+ return(false);
+ }
+
+ /* Reserve 2 bytes to write the number
+ of bytes the stored fields take in this
+ undo record */
+ *ptr += 2;
+
+ for (ulint col_no = 0; col_no < dict_table_get_n_v_cols(table);
+ col_no++) {
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(table, col_no);
+
+ if (col->m_col.ord_part) {
+
+			/* make sure there is enough space to write
+			the length */
+ if (trx_undo_left(undo_block, *ptr) < 5) {
+ return(false);
+ }
+
+ ulint pos = col_no;
+ pos += REC_MAX_N_FIELDS;
+ *ptr += mach_write_compressed(*ptr, pos);
+
+ *ptr = trx_undo_log_v_idx(undo_block, table,
+ col_no, *ptr, first_v_col);
+ first_v_col = false;
+
+ if (*ptr == NULL) {
+ return(false);
+ }
+
+ const dfield_t* vfield = dtuple_get_nth_v_field(
+ row, col->v_pos);
+ switch (ulint flen = vfield->len) {
+ case 0: case UNIV_SQL_NULL:
+ if (trx_undo_left(undo_block, *ptr) < 5) {
+ return(false);
+ }
+
+ *ptr += mach_write_compressed(*ptr, flen);
+ break;
+ default:
+ ulint max_len
+ = dict_max_v_field_len_store_undo(
+ table, col_no);
+
+ if (flen > max_len) {
+ flen = max_len;
+ }
+
+ if (trx_undo_left(undo_block, *ptr)
+ < flen + 5) {
+ return(false);
+ }
+ *ptr += mach_write_compressed(*ptr, flen);
+
+ memcpy(*ptr, vfield->data, flen);
+ *ptr += flen;
+ }
+ }
+ }
+
+	/* Always mark the end of the log with a 2-byte length field */
+ mach_write_to_2(start, ulint(*ptr - start));
+
+ return(true);
+}
+
+/**********************************************************************//**
+Writes to the undo log an insert of a clustered index record.
+@return offset of the inserted entry on the page if it succeeds, 0 on failure */
+static
+uint16_t
+trx_undo_page_report_insert(
+/*========================*/
+ buf_block_t* undo_block, /*!< in: undo log page */
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: index entry which will be
+ inserted to the clustered index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(index->is_primary());
+ /* MariaDB 10.3.1+ in trx_undo_page_init() always initializes
+ TRX_UNDO_PAGE_TYPE as 0, but previous versions wrote
+ TRX_UNDO_INSERT == 1 into insert_undo pages,
+ or TRX_UNDO_UPDATE == 2 into update_undo pages. */
+ ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
+ + undo_block->frame) <= 2);
+
+ uint16_t first_free = mach_read_from_2(my_assume_aligned<2>
+ (TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + undo_block->frame));
+ byte* ptr = undo_block->frame + first_free;
+
+ if (trx_undo_left(undo_block, ptr) < 2 + 1 + 11 + 11) {
+ /* Not enough space for writing the general parameters */
+ return(0);
+ }
+
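+	/* The insert undo record written below consists of: a 2-byte
+	next-record pointer, a 1-byte type (TRX_UNDO_INSERT_REC), the
+	compressed undo number, the compressed table id, and the unique
+	fields of the clustered index entry, each preceded by its
+	compressed length. */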
+ /* Reserve 2 bytes for the pointer to the next undo log record */
+ ptr += 2;
+
+ /* Store first some general parameters to the undo log */
+ *ptr++ = TRX_UNDO_INSERT_REC;
+ ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
+ ptr += mach_u64_write_much_compressed(ptr, index->table->id);
+ /*----------------------------------------*/
+ /* Store then the fields required to uniquely determine the record
+ to be inserted in the clustered index */
+ if (UNIV_UNLIKELY(clust_entry->info_bits != 0)) {
+ ut_ad(clust_entry->is_metadata());
+ ut_ad(index->is_instant());
+ ut_ad(undo_block->frame[first_free + 2]
+ == TRX_UNDO_INSERT_REC);
+ undo_block->frame[first_free + 2] = TRX_UNDO_INSERT_METADATA;
+ goto done;
+ }
+
+ for (unsigned i = 0; i < dict_index_get_n_unique(index); i++) {
+
+ const dfield_t* field = dtuple_get_nth_field(clust_entry, i);
+ ulint flen = dfield_get_len(field);
+
+ if (trx_undo_left(undo_block, ptr) < 5) {
+
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ switch (flen) {
+ case 0: case UNIV_SQL_NULL:
+ break;
+ default:
+ if (trx_undo_left(undo_block, ptr) < flen) {
+
+ return(0);
+ }
+
+ memcpy(ptr, dfield_get_data(field), flen);
+ ptr += flen;
+ }
+ }
+
+ if (index->table->n_v_cols) {
+ if (!trx_undo_report_insert_virtual(
+ undo_block, index->table, clust_entry, &ptr)) {
+ return(0);
+ }
+ }
+
+done:
+ return(trx_undo_page_set_next_prev_and_add(undo_block, ptr, mtr));
+}
+
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
+@return remaining part of undo log record after reading these values */
+byte*
+trx_undo_rec_get_pars(
+/*==================*/
+ trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ ulint* type, /*!< out: undo record type:
+ TRX_UNDO_INSERT_REC, ... */
+ ulint* cmpl_info, /*!< out: compiler info, relevant only
+ for update type records */
+ bool* updated_extern, /*!< out: true if we updated an
+					externally stored field */
+ undo_no_t* undo_no, /*!< out: undo log record number */
+ table_id_t* table_id) /*!< out: table id */
+{
+ const byte* ptr;
+ ulint type_cmpl;
+
+ ptr = undo_rec + 2;
+
+ type_cmpl = mach_read_from_1(ptr);
+ ptr++;
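+	/* The type_cmpl byte packs the record type in its low bits,
+	the compiler info multiplied by TRX_UNDO_CMPL_INFO_MULT, and
+	the TRX_UNDO_UPD_EXTERN flag. */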
+
+ *updated_extern = !!(type_cmpl & TRX_UNDO_UPD_EXTERN);
+ type_cmpl &= ~TRX_UNDO_UPD_EXTERN;
+ *type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1);
+ ut_ad(*type >= TRX_UNDO_RENAME_TABLE);
+ ut_ad(*type <= TRX_UNDO_DEL_MARK_REC);
+ *cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT;
+
+ *undo_no = mach_read_next_much_compressed(&ptr);
+ *table_id = mach_read_next_much_compressed(&ptr);
+ ut_ad(*table_id);
+
+ return(const_cast<byte*>(ptr));
+}
+
+/** Read from an undo log record a non-virtual column value.
+@param[in,out] ptr pointer to remaining part of the undo record
+@param[in,out] field stored field
+@param[in,out] len length of the field, or UNIV_SQL_NULL
+@param[in,out] orig_len original length of the locally stored part
+of an externally stored column, or 0
+@return remaining part of undo log record after reading these values */
+byte*
+trx_undo_rec_get_col_val(
+ const byte* ptr,
+ const byte** field,
+ uint32_t* len,
+ uint32_t* orig_len)
+{
+ *len = mach_read_next_compressed(&ptr);
+ *orig_len = 0;
+
+ switch (*len) {
+ case UNIV_SQL_NULL:
+ *field = NULL;
+ break;
+ case UNIV_EXTERN_STORAGE_FIELD:
+ *orig_len = mach_read_next_compressed(&ptr);
+ *len = mach_read_next_compressed(&ptr);
+ *field = ptr;
+ ptr += *len & ~SPATIAL_STATUS_MASK;
+
+ ut_ad(*orig_len >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_ad(*len > *orig_len);
+ /* @see dtuple_convert_big_rec() */
+ ut_ad(*len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ /* we do not have access to index->table here
+ ut_ad(dict_table_has_atomic_blobs(index->table)
+ || *len >= col->max_prefix
+ + BTR_EXTERN_FIELD_REF_SIZE);
+ */
+
+ *len += UNIV_EXTERN_STORAGE_FIELD;
+ break;
+ default:
+ *field = ptr;
+ if (*len >= UNIV_EXTERN_STORAGE_FIELD) {
+ ptr += (*len - UNIV_EXTERN_STORAGE_FIELD)
+ & ~SPATIAL_STATUS_MASK;
+ } else {
+ ptr += *len;
+ }
+ }
+
+ return(const_cast<byte*>(ptr));
+}
+
+/*******************************************************************//**
+Builds a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+ byte* ptr, /*!< in: remaining part of a copy of an undo log
+ record, at the start of the row reference;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the row reference is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t**ref, /*!< out, own: row reference */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ ulint ref_len;
+ ulint i;
+
+ ut_ad(index && ptr && ref && heap);
+ ut_a(dict_index_is_clust(index));
+
+ ref_len = dict_index_get_n_unique(index);
+
+ dtuple_t* tuple = dtuple_create(heap, ref_len);
+ *ref = tuple;
+
+ dict_index_copy_types(tuple, index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ const byte* field;
+ uint32_t len, orig_len;
+
+ dfield_t* dfield = dtuple_get_nth_field(tuple, i);
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ dfield_set_data(dfield, field, len);
+ }
+
+ return(ptr);
+}
+
+/*******************************************************************//**
+Skips a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+static
+byte*
+trx_undo_rec_skip_row_ref(
+/*======================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record, at the start of the row reference */
+ dict_index_t* index) /*!< in: clustered index */
+{
+ ulint ref_len;
+ ulint i;
+
+ ut_ad(index && ptr);
+ ut_a(dict_index_is_clust(index));
+
+ ref_len = dict_index_get_n_unique(index);
+
+ for (i = 0; i < ref_len; i++) {
+ const byte* field;
+ uint32_t len, orig_len;
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+ }
+
+ return(ptr);
+}
+
+/** Fetch a prefix of an externally stored column, for writing to the undo
+log of an update or delete marking of a clustered index record.
+@param[out] ext_buf buffer to hold the prefix data and BLOB pointer
+@param[in] prefix_len prefix size to store in the undo log
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] field an externally stored column
+@param[in,out] len input: length of field; output: used length of
+ext_buf
+@return ext_buf */
+static
+byte*
+trx_undo_page_fetch_ext(
+ byte* ext_buf,
+ ulint prefix_len,
+ ulint zip_size,
+ const byte* field,
+ ulint* len)
+{
+ /* Fetch the BLOB. */
+ ulint ext_len = btr_copy_externally_stored_field_prefix(
+ ext_buf, prefix_len, zip_size, field, *len);
+ /* BLOBs should always be nonempty. */
+ ut_a(ext_len);
+ /* Append the BLOB pointer to the prefix. */
+ memcpy(ext_buf + ext_len,
+ field + *len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ *len = ext_len + BTR_EXTERN_FIELD_REF_SIZE;
+ return(ext_buf);
+}
+
+/** Writes to the undo log a prefix of an externally stored column.
+@param[out] ptr undo log position, at least 15 bytes must be
+available
+@param[out] ext_buf a buffer of DICT_MAX_FIELD_LEN_BY_FORMAT()
+ size, or NULL when should not fetch a longer
+ prefix
+@param[in] prefix_len prefix size to store in the undo log
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] field the locally stored part of the externally
+stored column
+@param[in,out] len length of field, in bytes
+@param[in] spatial_status whether the column is used by spatial index or
+ regular index
+@return undo log position */
+static
+byte*
+trx_undo_page_report_modify_ext(
+ byte* ptr,
+ byte* ext_buf,
+ ulint prefix_len,
+ ulint zip_size,
+ const byte** field,
+ ulint* len,
+ spatial_status_t spatial_status)
+{
+ ulint spatial_len= 0;
+
+ switch (spatial_status) {
+ case SPATIAL_UNKNOWN:
+ case SPATIAL_NONE:
+ break;
+
+ case SPATIAL_MIXED:
+ case SPATIAL_ONLY:
+ spatial_len = DATA_MBR_LEN;
+ break;
+ }
+
+ /* Encode spatial status into length. */
+ spatial_len |= ulint(spatial_status) << SPATIAL_STATUS_SHIFT;
+
+ if (spatial_status == SPATIAL_ONLY) {
+		/* If the column is only used by a spatial index,
+		logging its MBR is enough. */
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD
+ + spatial_len);
+
+ return(ptr);
+ }
+
+ if (ext_buf) {
+ ut_a(prefix_len > 0);
+
+ /* If an ordering column is externally stored, we will
+ have to store a longer prefix of the field. In this
+ case, write to the log a marker followed by the
+ original length and the real length of the field. */
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD);
+
+ ptr += mach_write_compressed(ptr, *len);
+
+ *field = trx_undo_page_fetch_ext(ext_buf, prefix_len,
+ zip_size, *field, len);
+
+ ptr += mach_write_compressed(ptr, *len + spatial_len);
+ } else {
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD
+ + *len + spatial_len);
+ }
+
+ return(ptr);
+}
+
+/** Get MBR from a Geometry column stored externally
+@param[out] mbr MBR to fill
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] field field contain the geometry data
+@param[in,out] len length of field, in bytes
+*/
+static
+void
+trx_undo_get_mbr_from_ext(
+/*======================*/
+ double* mbr,
+ ulint zip_size,
+ const byte* field,
+ ulint* len)
+{
+ uchar* dptr = NULL;
+ ulint dlen;
+ mem_heap_t* heap = mem_heap_create(100);
+
+ dptr = btr_copy_externally_stored_field(
+ &dlen, field, zip_size, *len, heap);
+
+ if (dlen <= GEO_DATA_HEADER_SIZE) {
+ for (uint i = 0; i < SPDIMS; ++i) {
+ mbr[i * 2] = DBL_MAX;
+ mbr[i * 2 + 1] = -DBL_MAX;
+ }
+ } else {
+ rtree_mbr_from_wkb(dptr + GEO_DATA_HEADER_SIZE,
+ static_cast<uint>(dlen
+ - GEO_DATA_HEADER_SIZE), SPDIMS, mbr);
+ }
+
+ mem_heap_free(heap);
+}
+
+/**********************************************************************//**
+Writes to the undo log an update or delete marking of a clustered index
+record.
+@return byte offset of the inserted undo log entry on the page if it
+succeeds, 0 on failure */
+static
+uint16_t
+trx_undo_page_report_modify(
+/*========================*/
+ buf_block_t* undo_block, /*!< in: undo log page */
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: clustered index where update or
+ delete marking is done */
+ const rec_t* rec, /*!< in: clustered index record which
+ has NOT yet been modified */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update, /*!< in: update vector which tells the
+ columns to be updated; in the case of
+ a delete, this should be set to NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const dtuple_t* row, /*!< in: clustered index row contains
+ virtual column info */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(index->is_primary());
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ /* MariaDB 10.3.1+ in trx_undo_page_init() always initializes
+ TRX_UNDO_PAGE_TYPE as 0, but previous versions wrote
+ TRX_UNDO_INSERT == 1 into insert_undo pages,
+ or TRX_UNDO_UPDATE == 2 into update_undo pages. */
+ ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
+ + undo_block->frame) <= 2);
+
+ byte* ptr_to_first_free = my_assume_aligned<2>(TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + undo_block->frame);
+
+ const uint16_t first_free = mach_read_from_2(ptr_to_first_free);
+ byte *ptr = undo_block->frame + first_free;
+
+ if (trx_undo_left(undo_block, ptr) < 50) {
+ /* NOTE: the value 50 must be big enough so that the general
+ fields written below fit on the undo log page */
+ return 0;
+ }
+
+ /* Reserve 2 bytes for the pointer to the next undo log record */
+ ptr += 2;
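+	/* The record written below consists of: the type/cmpl_info byte,
+	the undo number, the table id, the info bits, the old DB_TRX_ID
+	and DB_ROLL_PTR values, the unique fields of the record, the old
+	values of the updated columns, and (unless cmpl_info contains
+	UPD_NODE_NO_ORD_CHANGE) the old values of all ordering columns. */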
+
+ dict_table_t* table = index->table;
+ const byte* field;
+ ulint flen;
+ ulint col_no;
+ ulint type_cmpl;
+ byte* type_cmpl_ptr;
+ ulint i;
+ trx_id_t trx_id;
+ ibool ignore_prefix = FALSE;
+ byte ext_buf[REC_VERSION_56_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE];
+ bool first_v_col = true;
+
+ /* Store first some general parameters to the undo log */
+
+ if (!update) {
+ ut_ad(!rec_is_delete_marked(rec, dict_table_is_comp(table)));
+ type_cmpl = TRX_UNDO_DEL_MARK_REC;
+ } else if (rec_is_delete_marked(rec, dict_table_is_comp(table))) {
+ /* In delete-marked records, DB_TRX_ID must
+ always refer to an existing update_undo log record. */
+ ut_ad(row_get_rec_trx_id(rec, index, offsets));
+
+ type_cmpl = TRX_UNDO_UPD_DEL_REC;
+ /* We are about to update a delete marked record.
+ We don't typically need the prefix in this case unless
+ the delete marking is done by the same transaction
+ (which we check below). */
+ ignore_prefix = TRUE;
+ } else {
+ type_cmpl = TRX_UNDO_UPD_EXIST_REC;
+ }
+
+ type_cmpl |= cmpl_info * TRX_UNDO_CMPL_INFO_MULT;
+ type_cmpl_ptr = ptr;
+
+ *ptr++ = (byte) type_cmpl;
+ ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
+
+ ptr += mach_u64_write_much_compressed(ptr, table->id);
+
+ /*----------------------------------------*/
+ /* Store the state of the info bits */
+
+ *ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table));
+
+ /* Store the values of the system columns */
+ field = rec_get_nth_field(rec, offsets, index->db_trx_id(), &flen);
+ ut_ad(flen == DATA_TRX_ID_LEN);
+
+ trx_id = trx_read_trx_id(field);
+
+ /* If it is an update of a delete marked record, then we are
+ allowed to ignore blob prefixes if the delete marking was done
+ by some other trx as it must have committed by now for us to
+ allow an over-write. */
+ if (trx_id == trx->id) {
+ ignore_prefix = false;
+ }
+ ptr += mach_u64_write_compressed(ptr, trx_id);
+
+ field = rec_get_nth_field(rec, offsets, index->db_roll_ptr(), &flen);
+ ut_ad(flen == DATA_ROLL_PTR_LEN);
+ ut_ad(memcmp(field, field_ref_zero, DATA_ROLL_PTR_LEN));
+
+ ptr += mach_u64_write_compressed(ptr, trx_read_roll_ptr(field));
+
+ /*----------------------------------------*/
+ /* Store then the fields required to uniquely determine the
+ record which will be modified in the clustered index */
+
+ for (i = 0; i < dict_index_get_n_unique(index); i++) {
+
+ /* The ordering columns must not be instant added columns. */
+ ut_ad(!rec_offs_nth_default(offsets, i));
+ field = rec_get_nth_field(rec, offsets, i, &flen);
+
+ /* The ordering columns must not be stored externally. */
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ ut_ad(dict_index_get_nth_col(index, i)->ord_part);
+
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_block, ptr) < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+
+ /*----------------------------------------*/
+ /* Save to the undo log the old values of the columns to be updated. */
+
+ if (update) {
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ ulint n_updated = upd_get_n_fields(update);
+
+		/* If this is an online update while an inplace ALTER TABLE
+		is in progress and the table has virtual columns, we need
+		to double-check whether any non-indexed columns are
+		registered in the update vector, in case they will be
+		indexed in the new table. */
+ if (dict_index_is_online_ddl(index) && table->n_v_cols > 0) {
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ upd_field_t* fld = upd_get_nth_field(
+ update, i);
+ ulint pos = fld->field_no;
+
+ /* These columns must not have an index
+ on them */
+ if (upd_fld_is_virtual_col(fld)
+ && dict_table_get_nth_v_col(
+ table, pos)->v_indexes.empty()) {
+ n_updated--;
+ }
+ }
+ }
+
+ i = 0;
+
+ if (UNIV_UNLIKELY(update->is_alter_metadata())) {
+ ut_ad(update->n_fields >= 1);
+ ut_ad(!upd_fld_is_virtual_col(&update->fields[0]));
+ ut_ad(update->fields[0].field_no
+ == index->first_user_field());
+ ut_ad(!dfield_is_ext(&update->fields[0].new_val));
+ ut_ad(!dfield_is_null(&update->fields[0].new_val));
+ /* The instant ADD COLUMN metadata record does not
+ contain the BLOB. Do not write anything for it. */
+ i = !rec_is_alter_metadata(rec, *index);
+ n_updated -= i;
+ }
+
+ ptr += mach_write_compressed(ptr, n_updated);
+
+ for (; i < upd_get_n_fields(update); i++) {
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return 0;
+ }
+
+ upd_field_t* fld = upd_get_nth_field(update, i);
+
+ bool is_virtual = upd_fld_is_virtual_col(fld);
+ ulint max_v_log_len = 0;
+
+ ulint pos = fld->field_no;
+ const dict_col_t* col = NULL;
+
+ if (is_virtual) {
+ /* Skip the non-indexed column, during
+ an online alter table */
+ if (dict_index_is_online_ddl(index)
+ && dict_table_get_nth_v_col(
+ table, pos)->v_indexes.empty()) {
+ continue;
+ }
+
+				/* Add REC_MAX_N_FIELDS to mark that
+				this is a virtual column */
+ ptr += mach_write_compressed(
+ ptr, pos + REC_MAX_N_FIELDS);
+
+ if (trx_undo_left(undo_block, ptr) < 15) {
+ return 0;
+ }
+
+ ut_ad(fld->field_no < table->n_v_def);
+
+ ptr = trx_undo_log_v_idx(undo_block, table,
+ fld->field_no, ptr,
+ first_v_col);
+ if (ptr == NULL) {
+ return(0);
+ }
+ first_v_col = false;
+
+ max_v_log_len
+ = dict_max_v_field_len_store_undo(
+ table, fld->field_no);
+
+ field = static_cast<byte*>(
+ fld->old_v_val->data);
+ flen = fld->old_v_val->len;
+
+ /* Only log sufficient bytes for index
+ record update */
+ if (flen != UNIV_SQL_NULL) {
+ flen = ut_min(
+ flen, max_v_log_len);
+ }
+
+ goto store_len;
+ }
+
+ if (UNIV_UNLIKELY(update->is_metadata())) {
+ ut_ad(pos >= index->first_user_field());
+ ut_ad(rec_is_metadata(rec, *index));
+
+ if (rec_is_alter_metadata(rec, *index)) {
+ ut_ad(update->is_alter_metadata());
+
+ field = rec_offs_n_fields(offsets)
+ > pos
+ && !rec_offs_nth_default(
+ offsets, pos)
+ ? rec_get_nth_field(
+ rec, offsets,
+ pos, &flen)
+ : index->instant_field_value(
+ pos - 1, &flen);
+
+ if (pos == index->first_user_field()) {
+ ut_ad(rec_offs_nth_extern(
+ offsets, pos));
+ ut_ad(flen == FIELD_REF_SIZE);
+ goto write_field;
+ }
+ col = dict_index_get_nth_col(index,
+ pos - 1);
+ } else if (!update->is_alter_metadata()) {
+ goto get_field;
+ } else {
+ /* We are converting an ADD COLUMN
+ metadata record to an ALTER TABLE
+ metadata record, with BLOB. Subtract
+ the missing metadata BLOB field. */
+ ut_ad(pos > index->first_user_field());
+ --pos;
+ goto get_field;
+ }
+ } else {
+get_field:
+ col = dict_index_get_nth_col(index, pos);
+ field = rec_get_nth_cfield(
+ rec, index, offsets, pos, &flen);
+ }
+write_field:
+ /* Write field number to undo log */
+ ptr += mach_write_compressed(ptr, pos);
+
+ if (trx_undo_left(undo_block, ptr) < 15) {
+ return 0;
+ }
+
+ if (rec_offs_n_fields(offsets) > pos
+ && rec_offs_nth_extern(offsets, pos)) {
+ ut_ad(col || pos == index->first_user_field());
+ ut_ad(col || update->is_alter_metadata());
+ ut_ad(col
+ || rec_is_alter_metadata(rec, *index));
+ ulint prefix_len = col
+ ? dict_max_field_len_store_undo(
+ table, col)
+ : 0;
+
+ ut_ad(prefix_len + BTR_EXTERN_FIELD_REF_SIZE
+ <= sizeof ext_buf);
+
+ ptr = trx_undo_page_report_modify_ext(
+ ptr,
+ col
+ && col->ord_part
+ && !ignore_prefix
+ && flen < REC_ANTELOPE_MAX_INDEX_COL_LEN
+ ? ext_buf : NULL, prefix_len,
+ table->space->zip_size(),
+ &field, &flen, SPATIAL_UNKNOWN);
+
+ *type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN;
+ } else {
+store_len:
+ ptr += mach_write_compressed(ptr, flen);
+ }
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_block, ptr) < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+
+ /* Also record the new value for virtual column */
+ if (is_virtual) {
+ field = static_cast<byte*>(fld->new_val.data);
+ flen = fld->new_val.len;
+ if (flen != UNIV_SQL_NULL) {
+ flen = ut_min(
+ flen, max_v_log_len);
+ }
+
+ if (trx_undo_left(undo_block, ptr) < 15) {
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_block, ptr)
+ < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+ }
+ }
+
+	/* Reset first_v_col so that the virtual column undo version
+	marker is written again when we log all the indexed columns. */
+ first_v_col = true;
+
+ /*----------------------------------------*/
+ /* In the case of a delete marking, and also in the case of an update
+ where any ordering field of any index changes, store the values of all
+ columns which occur as ordering fields in any index. This info is used
+	in the purge of old versions, where we use it to build and search
+	for the delete-marked index records, to see whether we can remove
+	them from the index tree. Note that starting from 4.0.14,
+	externally stored fields can also be ordering fields in some index.
+	Starting from 5.2, we no longer store the first
+	REC_MAX_INDEX_COL_LEN bytes in the undo log record, but we can
+	construct the column prefix fields in the index by fetching the
+	first page of the BLOB that is pointed to by the clustered index.
+	This also works in crash recovery, because all pages (including
+	BLOBs) are recovered before anything is rolled back. */
+
+ if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ byte* old_ptr = ptr;
+ double mbr[SPDIMS * 2];
+ mem_heap_t* row_heap = NULL;
+
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
+ /* Reserve 2 bytes to write the number of bytes the stored
+ fields take in this undo record */
+
+ ptr += 2;
+
+ for (col_no = 0; col_no < dict_table_get_n_cols(table);
+ col_no++) {
+
+ const dict_col_t* col
+ = dict_table_get_nth_col(table, col_no);
+
+ if (!col->ord_part) {
+ continue;
+ }
+
+ const ulint pos = dict_index_get_nth_col_pos(
+ index, col_no, NULL);
+ /* All non-virtual columns must be present in
+ the clustered index. */
+ ut_ad(pos != ULINT_UNDEFINED);
+
+ const bool is_ext = rec_offs_nth_extern(offsets, pos);
+ const spatial_status_t spatial_status = is_ext
+ ? dict_col_get_spatial_status(col)
+ : SPATIAL_NONE;
+
+ switch (spatial_status) {
+ case SPATIAL_UNKNOWN:
+ ut_ad(0);
+ /* fall through */
+ case SPATIAL_MIXED:
+ case SPATIAL_ONLY:
+ /* Externally stored spatially indexed
+ columns will be (redundantly) logged
+ again, because we did not write the
+ MBR yet, that is, the previous call to
+ trx_undo_page_report_modify_ext()
+ was with SPATIAL_UNKNOWN. */
+ break;
+ case SPATIAL_NONE:
+ if (!update) {
+ /* This is a DELETE operation. */
+ break;
+ }
+ /* Avoid redundantly logging indexed
+ columns that were updated. */
+
+ for (i = 0; i < update->n_fields; i++) {
+ const ulint field_no
+ = upd_get_nth_field(update, i)
+ ->field_no;
+ if (field_no >= index->n_fields
+ || dict_index_get_nth_field(
+ index, field_no)->col
+ == col) {
+ goto already_logged;
+ }
+ }
+ }
+
+ if (true) {
+ /* Write field number to undo log */
+ if (trx_undo_left(undo_block, ptr) < 5 + 15) {
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, pos);
+
+ /* Save the old value of field */
+ field = rec_get_nth_cfield(
+ rec, index, offsets, pos, &flen);
+
+ if (is_ext) {
+ const dict_col_t* col =
+ dict_index_get_nth_col(
+ index, pos);
+ ulint prefix_len =
+ dict_max_field_len_store_undo(
+ table, col);
+
+ ut_a(prefix_len < sizeof ext_buf);
+ const ulint zip_size
+ = table->space->zip_size();
+
+ /* If there is a spatial index on it,
+ log its MBR */
+ if (spatial_status != SPATIAL_NONE) {
+ ut_ad(DATA_GEOMETRY_MTYPE(
+ col->mtype));
+
+ trx_undo_get_mbr_from_ext(
+ mbr, zip_size,
+ field, &flen);
+ }
+
+ ptr = trx_undo_page_report_modify_ext(
+ ptr,
+ flen < REC_ANTELOPE_MAX_INDEX_COL_LEN
+ && !ignore_prefix
+ ? ext_buf : NULL, prefix_len,
+ zip_size,
+ &field, &flen,
+ spatial_status);
+ } else {
+ ptr += mach_write_compressed(
+ ptr, flen);
+ }
+
+ if (flen != UNIV_SQL_NULL
+ && spatial_status != SPATIAL_ONLY) {
+ if (trx_undo_left(undo_block, ptr)
+ < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+
+ if (spatial_status != SPATIAL_NONE) {
+ if (trx_undo_left(undo_block, ptr)
+ < DATA_MBR_LEN) {
+ return(0);
+ }
+
+ for (int i = 0; i < SPDIMS * 2;
+ i++) {
+ mach_double_write(
+ ptr, mbr[i]);
+ ptr += sizeof(double);
+ }
+ }
+ }
+
+already_logged:
+ continue;
+ }
+
+ for (col_no = 0; col_no < dict_table_get_n_v_cols(table);
+ col_no++) {
+ const dict_v_col_t* col
+ = dict_table_get_nth_v_col(table, col_no);
+
+ if (col->m_col.ord_part) {
+ ulint pos = col_no;
+ ulint max_v_log_len
+ = dict_max_v_field_len_store_undo(
+ table, pos);
+
+ /* Write field number to undo log.
+ Make sure there is enough space in the log */
+ if (trx_undo_left(undo_block, ptr) < 5) {
+ return(0);
+ }
+
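+ /* Virtual column positions are logged shifted by
+ REC_MAX_N_FIELDS so that the undo log parser can tell
+ them apart from ordinary clustered index field numbers. */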
+ pos += REC_MAX_N_FIELDS;
+ ptr += mach_write_compressed(ptr, pos);
+
+ ut_ad(col_no < table->n_v_def);
+ ptr = trx_undo_log_v_idx(undo_block, table,
+ col_no, ptr,
+ first_v_col);
+ first_v_col = false;
+
+ if (!ptr) {
+ return(0);
+ }
+
+ const dfield_t* vfield = NULL;
+
+ if (update) {
+ ut_ad(!row);
+ if (update->old_vrow == NULL) {
+ flen = UNIV_SQL_NULL;
+ } else {
+ vfield = dtuple_get_nth_v_field(
+ update->old_vrow,
+ col->v_pos);
+ }
+ } else if (row) {
+ vfield = dtuple_get_nth_v_field(
+ row, col->v_pos);
+ } else {
+ ut_ad(0);
+ }
+
+ if (vfield) {
+ field = static_cast<byte*>(vfield->data);
+ flen = vfield->len;
+ } else {
+ ut_ad(flen == UNIV_SQL_NULL);
+ }
+
+ if (flen != UNIV_SQL_NULL) {
+ flen = ut_min(
+ flen, max_v_log_len);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ switch (flen) {
+ case 0: case UNIV_SQL_NULL:
+ break;
+ default:
+ if (trx_undo_left(undo_block, ptr)
+ < flen) {
+ return(0);
+ }
+
+ memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+ }
+
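+ /* Fill in the 2 bytes reserved above with the length of the
+ index column data stored in this undo log record. */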
+ mach_write_to_2(old_ptr, ulint(ptr - old_ptr));
+
+ if (row_heap) {
+ mem_heap_free(row_heap);
+ }
+ }
+
+ /*----------------------------------------*/
+ /* Write pointers to the previous and the next undo log records */
+ if (trx_undo_left(undo_block, ptr) < 2) {
+ return(0);
+ }
+
+ mach_write_to_2(ptr, first_free);
+ const uint16_t new_free = static_cast<uint16_t>(
+ ptr + 2 - undo_block->frame);
+ mach_write_to_2(undo_block->frame + first_free, new_free);
+
+ mach_write_to_2(ptr_to_first_free, new_free);
+
+ const byte* start = &undo_block->frame[first_free + 2];
+ mtr->undo_append(*undo_block, start, ptr - start);
+ return(first_free);
+}
+
+/**********************************************************************//**
+Reads from an undo log update record the system field values of the old
+version.
+@return remaining part of undo log record after reading these values */
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+ const byte* ptr, /*!< in: remaining part of undo
+ log record after reading
+ general parameters */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr, /*!< out: roll ptr */
+ byte* info_bits) /*!< out: info bits state */
+{
+ /* Read the state of the info bits */
+ *info_bits = *ptr++;
+
+ /* Read the values of the system columns */
+
+ *trx_id = mach_u64_read_next_compressed(&ptr);
+ *roll_ptr = mach_u64_read_next_compressed(&ptr);
+
+ return(const_cast<byte*>(ptr));
+}
+
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+@return remaining part of the record, NULL if an error detected, which
+means that the record is corrupted */
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+ const byte* ptr, /*!< in: remaining part in update undo log
+ record, after reading the row reference
+ NOTE that this copy of the undo log record must
+ be preserved as long as the update vector is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC,
+ TRX_UNDO_UPD_DEL_REC, or
+ TRX_UNDO_DEL_MARK_REC; in the last case,
+ only trx id and roll ptr fields are added to
+ the update vector */
+ trx_id_t trx_id, /*!< in: transaction id from this undo record */
+ roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */
+ byte info_bits,/*!< in: info bits from this undo record */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ upd_t** upd) /*!< out, own: update vector */
+{
+ upd_field_t* upd_field;
+ upd_t* update;
+ ulint n_fields;
+ byte* buf;
+ bool first_v_col = true;
+ bool is_undo_log = true;
+ ulint n_skip_field = 0;
+
+ ut_a(dict_index_is_clust(index));
+
+ if (type != TRX_UNDO_DEL_MARK_REC) {
+ n_fields = mach_read_next_compressed(&ptr);
+ } else {
+ n_fields = 0;
+ }
+
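+ /* The update vector gets two extra slots for the DB_TRX_ID
+ and DB_ROLL_PTR system columns, in addition to the n_fields
+ columns that were logged. */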
+ *upd = update = upd_create(n_fields + 2, heap);
+
+ update->info_bits = info_bits;
+
+ /* Store first trx id and roll ptr to update vector */
+
+ upd_field = upd_get_nth_field(update, n_fields);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_TRX_ID_LEN));
+
+ mach_write_to_6(buf, trx_id);
+
+ upd_field_set_field_no(upd_field, index->db_trx_id(), index);
+ dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN);
+
+ upd_field = upd_get_nth_field(update, n_fields + 1);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_ROLL_PTR_LEN));
+
+ trx_write_roll_ptr(buf, roll_ptr);
+
+ upd_field_set_field_no(upd_field, index->db_roll_ptr(), index);
+ dfield_set_data(&(upd_field->new_val), buf, DATA_ROLL_PTR_LEN);
+
+ /* Store then the updated ordinary columns to the update vector */
+
+ for (ulint i = 0; i < n_fields; i++) {
+ const byte* field;
+ uint32_t len, orig_len;
+
+ upd_field = upd_get_nth_field(update, i);
+ uint32_t field_no = mach_read_next_compressed(&ptr);
+
+ const bool is_virtual = (field_no >= REC_MAX_N_FIELDS);
+
+ if (is_virtual) {
+ /* If new version, we need to check index list to figure
+ out the correct virtual column position */
+ ptr = trx_undo_read_v_idx(
+ index->table, ptr, first_v_col, &is_undo_log,
+ &field_no);
+ first_v_col = false;
+ /* This column could be dropped or no longer indexed */
+ if (field_no >= index->n_fields) {
+ /* Mark this is no longer needed */
+ upd_field->field_no = REC_MAX_N_FIELDS;
+
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+ n_skip_field++;
+ continue;
+ }
+
+ upd_field_set_v_field_no(
+ upd_field, static_cast<uint16_t>(field_no),
+ index);
+ } else if (UNIV_UNLIKELY((update->info_bits
+ & ~REC_INFO_DELETED_FLAG)
+ == REC_INFO_MIN_REC_FLAG)) {
+ ut_ad(type == TRX_UNDO_UPD_EXIST_REC);
+ const uint32_t uf = index->first_user_field();
+ ut_ad(field_no >= uf);
+
+ if (update->info_bits != REC_INFO_MIN_REC_FLAG) {
+ /* Generic instant ALTER TABLE */
+ if (field_no == uf) {
+ upd_field->new_val.type
+ .metadata_blob_init();
+ } else if (field_no >= index->n_fields) {
+ /* This is reachable during
+ purge if the table was emptied
+ and converted to the canonical
+ format on a later ALTER TABLE.
+ In this case,
+ row_purge_upd_exist_or_extern()
+ would only be interested in
+ freeing any BLOBs that were
+ updated, that is, the metadata
+ BLOB above. Other BLOBs in
+ the metadata record are never
+ updated; they are for the
+ initial DEFAULT values of the
+ instantly added columns, and
+ they will never change.
+
+ Note: if the table becomes
+ empty during ROLLBACK or is
+ empty during subsequent ALTER
+ TABLE, and btr_page_empty() is
+ called to re-create the root
+ page without the metadata
+ record, in that case we should
+ only free the latest version
+ of BLOBs in the record,
+ which purge would never touch. */
+ field_no = REC_MAX_N_FIELDS;
+ n_skip_field++;
+ } else {
+ dict_col_copy_type(
+ dict_index_get_nth_col(
+ index, field_no - 1),
+ &upd_field->new_val.type);
+ }
+ } else {
+ /* Instant ADD COLUMN...LAST */
+ dict_col_copy_type(
+ dict_index_get_nth_col(index,
+ field_no),
+ &upd_field->new_val.type);
+ }
+ upd_field->field_no = field_no
+ & dict_index_t::MAX_N_FIELDS;
+ } else if (field_no < index->n_fields) {
+ upd_field_set_field_no(upd_field,
+ static_cast<uint16_t>(field_no),
+ index);
+ } else {
+ ib::error() << "Trying to access update undo rec"
+ " field " << field_no
+ << " in index " << index->name
+ << " of table " << index->table->name
+ << " but index has only "
+ << dict_index_get_n_fields(index)
+ << " fields " << BUG_REPORT_MSG
+ << ". Run also CHECK TABLE "
+ << index->table->name << "."
+ " n_fields = " << n_fields << ", i = " << i;
+
+ ut_ad(0);
+ *upd = NULL;
+ return(NULL);
+ }
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ upd_field->orig_len = static_cast<uint16_t>(orig_len);
+
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_null(&upd_field->new_val);
+ } else if (len < UNIV_EXTERN_STORAGE_FIELD) {
+ dfield_set_data(&upd_field->new_val, field, len);
+ } else {
+ len -= UNIV_EXTERN_STORAGE_FIELD;
+
+ dfield_set_data(&upd_field->new_val, field, len);
+ dfield_set_ext(&upd_field->new_val);
+ }
+
+ ut_ad(update->info_bits != (REC_INFO_DELETED_FLAG
+ | REC_INFO_MIN_REC_FLAG)
+ || field_no != index->first_user_field()
+ || (upd_field->new_val.ext
+ && upd_field->new_val.len == FIELD_REF_SIZE));
+
+ if (is_virtual) {
+ upd_field->old_v_val = static_cast<dfield_t*>(
+ mem_heap_alloc(
+ heap, sizeof *upd_field->old_v_val));
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_null(upd_field->old_v_val);
+ } else if (len < UNIV_EXTERN_STORAGE_FIELD) {
+ dfield_set_data(
+ upd_field->old_v_val, field, len);
+ } else {
+ ut_ad(0);
+ }
+ }
+ }
+
+ /* We may have to skip dropped indexed virtual columns.
+ Also, we may have to trim the update vector of a metadata record
+ if dict_index_t::clear_instant_alter() was invoked on the table
+ later, and the number of fields no longer matches. */
+
+ if (n_skip_field) {
+ upd_field_t* d = upd_get_nth_field(update, 0);
+ const upd_field_t* const end = d + n_fields + 2;
+
+ for (const upd_field_t* s = d; s != end; s++) {
+ if (s->field_no != REC_MAX_N_FIELDS) {
+ *d++ = *s;
+ }
+ }
+
+ ut_ad(d + n_skip_field == end);
+ update->n_fields = d - upd_get_nth_field(update, 0);
+ }
+
+ return(const_cast<byte*>(ptr));
+}
+
+/*******************************************************************//**
+Builds a partial row from an update undo log record, for purge.
+It contains the columns which occur as ordering in any index of the table.
+Any missing columns are indicated by col->mtype == DATA_MISSING.
+@return pointer to remaining part of undo record */
+byte*
+trx_undo_rec_get_partial_row(
+/*=========================*/
+ const byte* ptr, /*!< in: remaining part in update undo log
+ record of a suitable type, at the start of
+ the stored index columns;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the partial row is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ const upd_t* update, /*!< in: updated columns */
+ dtuple_t** row, /*!< out, own: partial row */
+ ibool ignore_prefix, /*!< in: flag to indicate if we
+ expect blob prefixes in undo. Used
+ only in the assertion. */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ const byte* end_ptr;
+ bool first_v_col = true;
+ bool is_undo_log = true;
+
+ ut_ad(index->is_primary());
+
+ *row = dtuple_create_with_vcol(
+ heap, dict_table_get_n_cols(index->table),
+ dict_table_get_n_v_cols(index->table));
+
+ /* Mark all columns in the row uninitialized, so that
+ we can distinguish missing fields from fields that are SQL NULL. */
+ for (ulint i = 0; i < dict_table_get_n_cols(index->table); i++) {
+ dfield_get_type(dtuple_get_nth_field(*row, i))
+ ->mtype = DATA_MISSING;
+ }
+
+ dtuple_init_v_fld(*row);
+
+ for (const upd_field_t* uf = update->fields, * const ue
+ = update->fields + update->n_fields;
+ uf != ue; uf++) {
+ if (uf->old_v_val) {
+ continue;
+ }
+ const dict_col_t& c = *dict_index_get_nth_col(index,
+ uf->field_no);
+ if (!c.is_dropped()) {
+ *dtuple_get_nth_field(*row, c.ind) = uf->new_val;
+ }
+ }
+
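+ /* The stored index columns are preceded by a 2-byte total
+ length that was written by trx_undo_page_report_modify(). */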
+ end_ptr = ptr + mach_read_from_2(ptr);
+ ptr += 2;
+
+ while (ptr != end_ptr) {
+ dfield_t* dfield;
+ const byte* field;
+ uint32_t field_no;
+ const dict_col_t* col;
+ uint32_t len, orig_len;
+
+ field_no = mach_read_next_compressed(&ptr);
+
+ const bool is_virtual = (field_no >= REC_MAX_N_FIELDS);
+
+ if (is_virtual) {
+ ptr = trx_undo_read_v_idx(
+ index->table, ptr, first_v_col, &is_undo_log,
+ &field_no);
+ first_v_col = false;
+ }
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ /* This column could be dropped or no longer indexed */
+ if (field_no == FIL_NULL) {
+ ut_ad(is_virtual);
+ continue;
+ }
+
+ if (is_virtual) {
+ dict_v_col_t* vcol = dict_table_get_nth_v_col(
+ index->table, field_no);
+ col = &vcol->m_col;
+ dfield = dtuple_get_nth_v_field(*row, vcol->v_pos);
+ dict_col_copy_type(
+ &vcol->m_col,
+ dfield_get_type(dfield));
+ } else {
+ col = dict_index_get_nth_col(index, field_no);
+
+ if (col->is_dropped()) {
+ continue;
+ }
+
+ dfield = dtuple_get_nth_field(*row, col->ind);
+ ut_ad(dfield->type.mtype == DATA_MISSING
+ || dict_col_type_assert_equal(col,
+ &dfield->type));
+ ut_ad(dfield->type.mtype == DATA_MISSING
+ || dfield->len == len
+ || (len != UNIV_SQL_NULL
+ && len >= UNIV_EXTERN_STORAGE_FIELD));
+ dict_col_copy_type(col, dfield_get_type(dfield));
+ }
+
+ dfield_set_data(dfield, field, len);
+
+ if (len != UNIV_SQL_NULL
+ && len >= UNIV_EXTERN_STORAGE_FIELD) {
+ spatial_status_t spatial_status;
+
+ /* Decode spatial status. */
+ spatial_status = static_cast<spatial_status_t>(
+ (len & SPATIAL_STATUS_MASK)
+ >> SPATIAL_STATUS_SHIFT);
+ len &= ~SPATIAL_STATUS_MASK;
+
+ /* Keep compatible with 5.7.9 format. */
+ if (spatial_status == SPATIAL_UNKNOWN) {
+ spatial_status =
+ dict_col_get_spatial_status(col);
+ }
+
+ switch (spatial_status) {
+ case SPATIAL_ONLY:
+ ut_ad(len - UNIV_EXTERN_STORAGE_FIELD
+ == DATA_MBR_LEN);
+ dfield_set_len(
+ dfield,
+ len - UNIV_EXTERN_STORAGE_FIELD);
+ break;
+
+ case SPATIAL_MIXED:
+ dfield_set_len(
+ dfield,
+ len - UNIV_EXTERN_STORAGE_FIELD
+ - DATA_MBR_LEN);
+ break;
+
+ case SPATIAL_NONE:
+ dfield_set_len(
+ dfield,
+ len - UNIV_EXTERN_STORAGE_FIELD);
+ break;
+
+ case SPATIAL_UNKNOWN:
+ ut_ad(0);
+ break;
+ }
+
+ dfield_set_ext(dfield);
+ dfield_set_spatial_status(dfield, spatial_status);
+
+ /* If the prefix of this column is indexed,
+ ensure that enough prefix is stored in the
+ undo log record. */
+ if (!ignore_prefix && col->ord_part
+ && spatial_status != SPATIAL_ONLY) {
+ ut_a(dfield_get_len(dfield)
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_a(dict_table_has_atomic_blobs(index->table)
+ || dfield_get_len(dfield)
+ >= REC_ANTELOPE_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ }
+
+ return(const_cast<byte*>(ptr));
+}
+
+/** Report a RENAME TABLE operation.
+@param[in,out] trx transaction
+@param[in] table table that is being renamed
+@param[in,out] block undo page
+@param[in,out] mtr mini-transaction
+@return byte offset of the undo log record
+@retval 0 in case of failure */
+static
+uint16_t
+trx_undo_page_report_rename(trx_t* trx, const dict_table_t* table,
+ buf_block_t* block, mtr_t* mtr)
+{
+ byte* ptr_first_free = my_assume_aligned<2>(TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + block->frame);
+ const uint16_t first_free = mach_read_from_2(ptr_first_free);
+ ut_ad(first_free >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ ut_ad(first_free <= srv_page_size - FIL_PAGE_DATA_END);
+ byte* const start = block->frame + first_free;
+ size_t len = strlen(table->name.m_name);
+ const size_t fixed = 2 + 1 + 11 + 11 + 2;
+ ut_ad(len <= NAME_LEN * 2 + 1);
+ /* The -10 is used in trx_undo_left() */
+ compile_time_assert((NAME_LEN * 1) * 2 + fixed
+ + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE
+ < UNIV_PAGE_SIZE_MIN - 10 - FIL_PAGE_DATA_END);
+
+ if (trx_undo_left(block, start) < fixed + len) {
+ ut_ad(first_free > TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_HDR_SIZE);
+ return 0;
+ }
+
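+ /* The first 2 bytes of the record will hold the offset of the
+ next undo log record; they are filled in below, once the end
+ of this record is known. */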
+ byte* ptr = start + 2;
+ *ptr++ = TRX_UNDO_RENAME_TABLE;
+ ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
+ ptr += mach_u64_write_much_compressed(ptr, table->id);
+ memcpy(ptr, table->name.m_name, len);
+ ptr += len;
+ mach_write_to_2(ptr, first_free);
+ mach_write_to_2(ptr_first_free, ptr + 2 - block->frame);
+ memcpy(start, ptr_first_free, 2);
+ mtr->undo_append(*block, start + 2, ptr - start - 2);
+ return first_free;
+}
+
+/** Report a RENAME TABLE operation.
+@param[in,out] trx transaction
+@param[in] table table that is being renamed
+@return DB_SUCCESS or error code */
+dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table)
+{
+ ut_ad(!trx->read_only);
+ ut_ad(trx->id);
+ ut_ad(!table->is_temporary());
+
+ mtr_t mtr;
+ dberr_t err;
+ mtr.start();
+ if (buf_block_t* block = trx_undo_assign(trx, &err, &mtr)) {
+ trx_undo_t* undo = trx->rsegs.m_redo.undo;
+ ut_ad(err == DB_SUCCESS);
+ ut_ad(undo);
+ for (ut_d(int loop_count = 0);;) {
+ ut_ad(loop_count++ < 2);
+ ut_ad(undo->last_page_no
+ == block->page.id().page_no());
+
+ if (uint16_t offset = trx_undo_page_report_rename(
+ trx, table, block, &mtr)) {
+ undo->top_page_no = undo->last_page_no;
+ undo->top_offset = offset;
+ undo->top_undo_no = trx->undo_no++;
+ undo->guess_block = block;
+ ut_ad(!undo->empty());
+
+ err = DB_SUCCESS;
+ break;
+ } else {
+ mtr.commit();
+ mtr.start();
+ block = trx_undo_add_page(undo, &mtr);
+ if (!block) {
+ err = DB_OUT_OF_FILE_SPACE;
+ break;
+ }
+ }
+ }
+ }
+
+ mtr.commit();
+ return err;
+}
+
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction.
+@return DB_SUCCESS or error code */
+dberr_t
+trx_undo_report_row_operation(
+/*==========================*/
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: in the case of an insert,
+ index entry to insert into the
+ clustered index; in updates,
+ may contain a clustered index
+ record tuple that also contains
+ virtual columns of the table;
+ otherwise, NULL */
+ const upd_t* update, /*!< in: in the case of an update,
+ the update vector, otherwise NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const rec_t* rec, /*!< in: case of an update or delete
+ marking, the record in the clustered
+ index; NULL if insert */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */
+ roll_ptr_t* roll_ptr) /*!< out: DB_ROLL_PTR to the
+ undo log record */
+{
+ trx_t* trx;
+ mtr_t mtr;
+#ifdef UNIV_DEBUG
+ int loop_count = 0;
+#endif /* UNIV_DEBUG */
+
+ ut_a(dict_index_is_clust(index));
+ ut_ad(!update || rec);
+ ut_ad(!rec || rec_offs_validate(rec, index, offsets));
+ ut_ad(!srv_read_only_mode);
+
+ trx = thr_get_trx(thr);
+ /* This function must not be invoked during rollback
+ (of a TRX_STATE_PREPARE transaction or otherwise). */
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_ad(!trx->in_rollback);
+
+ mtr.start();
+ trx_undo_t** pundo;
+ trx_rseg_t* rseg;
+ const bool is_temp = index->table->is_temporary();
+
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ rseg = trx->get_temp_rseg();
+ pundo = &trx->rsegs.m_noredo.undo;
+ } else {
+ ut_ad(!trx->read_only);
+ ut_ad(trx->id);
+ pundo = &trx->rsegs.m_redo.undo;
+ rseg = trx->rsegs.m_redo.rseg;
+ }
+
+ dberr_t err;
+ buf_block_t* undo_block = trx_undo_assign_low(trx, rseg, pundo,
+ &err, &mtr);
+ trx_undo_t* undo = *pundo;
+
+ ut_ad((err == DB_SUCCESS) == (undo_block != NULL));
+ if (UNIV_UNLIKELY(undo_block == NULL)) {
+ goto err_exit;
+ }
+
+ ut_ad(undo != NULL);
+
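+ /* Try to write the undo log record to the last undo log page.
+ If it does not fit, the undo log is extended by one page and
+ the write is retried. */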
+ do {
+ uint16_t offset = !rec
+ ? trx_undo_page_report_insert(
+ undo_block, trx, index, clust_entry, &mtr)
+ : trx_undo_page_report_modify(
+ undo_block, trx, index, rec, offsets, update,
+ cmpl_info, clust_entry, &mtr);
+
+ if (UNIV_UNLIKELY(offset == 0)) {
+ const uint16_t first_free = mach_read_from_2(
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + undo_block->frame);
+ memset(undo_block->frame + first_free, 0,
+ (srv_page_size - FIL_PAGE_DATA_END)
+ - first_free);
+
+ if (first_free
+ == TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE) {
+ /* The record did not fit on an empty
+ undo page. Discard the freshly allocated
+ page and return an error. */
+
+ /* When we remove a page from an undo
+ log, this is analogous to a
+ pessimistic insert in a B-tree, and we
+ must reserve the counterpart of the
+ tree latch, which is the rseg
+ mutex. We must commit the mini-transaction
+ first, because it may be holding lower-level
+ latches, such as SYNC_FSP and SYNC_FSP_PAGE. */
+
+ mtr.commit();
+ mtr.start();
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ mutex_enter(&rseg->mutex);
+ trx_undo_free_last_page(undo, &mtr);
+ mutex_exit(&rseg->mutex);
+
+ err = DB_UNDO_RECORD_TOO_BIG;
+ goto err_exit;
+ } else {
+ /* Write log for clearing the unused
+ tail of the undo page. It might
+ contain some garbage from a previously
+ written record, and mtr_t::write()
+ will optimize away writes of unchanged
+ bytes. Failure to write this caused a
+ recovery failure when we avoided
+ reading the undo log page from the
+ data file and initialized it based on
+ redo log records (which included the
+ write of the previous garbage). */
+ mtr.memset(*undo_block, first_free,
+ srv_page_size - first_free
+ - FIL_PAGE_DATA_END, 0);
+ }
+
+ mtr.commit();
+ } else {
+ /* Success */
+ undo->top_page_no = undo_block->page.id().page_no();
+ mtr.commit();
+ undo->top_offset = offset;
+ undo->top_undo_no = trx->undo_no++;
+ undo->guess_block = undo_block;
+ ut_ad(!undo->empty());
+
+ if (!is_temp) {
+ const undo_no_t limit = undo->top_undo_no;
+ /* Determine if this is the first time
+ when this transaction modifies a
+ system-versioned column in this table. */
+ trx_mod_table_time_t& time
+ = trx->mod_tables.insert(
+ trx_mod_tables_t::value_type(
+ index->table, limit))
+ .first->second;
+ ut_ad(time.valid(limit));
+
+ if (!time.is_versioned()
+ && index->table->versioned_by_id()
+ && (!rec /* INSERT */
+ || (update
+ && update->affects_versioned()))) {
+ time.set_versioned(limit);
+ }
+ }
+
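+ /* The DB_ROLL_PTR encodes whether this was an insert, the
+ rollback segment id, the undo log page number and the byte
+ offset of the record within that page. */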
+ *roll_ptr = trx_undo_build_roll_ptr(
+ !rec, rseg->id, undo->top_page_no, offset);
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(undo_block->page.id().page_no() == undo->last_page_no);
+
+ /* We have to extend the undo log by one page */
+
+ ut_ad(++loop_count < 2);
+ mtr.start();
+
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ undo_block = trx_undo_add_page(undo, &mtr);
+
+ DBUG_EXECUTE_IF("ib_err_ins_undo_page_add_failure",
+ undo_block = NULL;);
+ } while (UNIV_LIKELY(undo_block != NULL));
+
+ ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ DB_OUT_OF_FILE_SPACE,
+ //ER_INNODB_UNDO_LOG_FULL,
+ "No more space left over in %s tablespace for allocating UNDO"
+ " log pages. Please add new data file to the tablespace or"
+ " check if filesystem is full or enable auto-extension for"
+ " the tablespace",
+ undo->rseg->space == fil_system.sys_space
+ ? "system" : is_temp ? "temporary" : "undo");
+
+ /* Did not succeed: out of space */
+ err = DB_OUT_OF_FILE_SPACE;
+
+err_exit:
+ mtr_commit(&mtr);
+ return(err);
+}
+
+/*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/
+
+/** Copy an undo record to heap.
+@param[in] roll_ptr roll pointer to a record that exists
+@param[in,out] heap memory heap where copied */
+static
+trx_undo_rec_t*
+trx_undo_get_undo_rec_low(
+ roll_ptr_t roll_ptr,
+ mem_heap_t* heap)
+{
+ trx_undo_rec_t* undo_rec;
+ ulint rseg_id;
+ uint32_t page_no;
+ uint16_t offset;
+ trx_rseg_t* rseg;
+ bool is_insert;
+ mtr_t mtr;
+
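+ /* Split the roll pointer into its components: insert flag,
+ rollback segment id, undo log page number and byte offset
+ within the page. */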
+ trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no,
+ &offset);
+ ut_ad(page_no > FSP_FIRST_INODE_PAGE_NO);
+ ut_ad(offset >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ rseg = trx_sys.rseg_array[rseg_id];
+ ut_ad(rseg->is_persistent());
+
+ mtr.start();
+
+ buf_block_t* undo_page = trx_undo_page_get_s_latched(
+ page_id_t(rseg->space->id, page_no), &mtr);
+
+ undo_rec = trx_undo_rec_copy(undo_page->frame + offset, heap);
+
+ mtr.commit();
+
+ return(undo_rec);
+}
+
+/** Copy an undo record to heap.
+@param[in] roll_ptr roll pointer to record
+@param[in,out] heap memory heap where copied
+@param[in] trx_id id of the trx that generated
+ the roll pointer: it points to an
+ undo log of this transaction
+@param[in] name table name
+@param[out] undo_rec own: copy of the record
+@retval true if the undo log has been
+truncated and we cannot fetch the old version
+@retval false if the undo log record is available
+NOTE: the caller must have latches on the clustered index page. */
+static MY_ATTRIBUTE((warn_unused_result))
+bool
+trx_undo_get_undo_rec(
+ roll_ptr_t roll_ptr,
+ mem_heap_t* heap,
+ trx_id_t trx_id,
+ const table_name_t& name,
+ trx_undo_rec_t** undo_rec)
+{
+ rw_lock_s_lock(&purge_sys.latch);
+
+ bool missing_history = purge_sys.changes_visible(trx_id, name);
+ if (!missing_history) {
+ *undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+ }
+
+ rw_lock_s_unlock(&purge_sys.latch);
+
+ return(missing_history);
+}
+
+#ifdef UNIV_DEBUG
+#define ATTRIB_USED_ONLY_IN_DEBUG
+#else /* UNIV_DEBUG */
+#define ATTRIB_USED_ONLY_IN_DEBUG MY_ATTRIBUTE((unused))
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Build a previous version of a clustered index record. The caller must
+hold a latch on the index page of the clustered index record.
+@retval true if previous version was built, or if it was an insert
+or the table has been rebuilt
+@retval false if the previous version is earlier than purge_view,
+or being purged, which means that it may have been removed */
+bool
+trx_undo_prev_version_build(
+/*========================*/
+ const rec_t* index_rec ATTRIB_USED_ONLY_IN_DEBUG,
+ /*!< in: clustered index record in the
+ index tree */
+ mtr_t* index_mtr ATTRIB_USED_ONLY_IN_DEBUG,
+ /*!< in: mtr which contains the latch to
+ index_rec page and purge_view */
+ const rec_t* rec, /*!< in: version of a clustered index record */
+ dict_index_t* index, /*!< in: clustered index */
+ rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ rec_t** old_vers,/*!< out, own: previous version, or NULL if
+ rec is the first inserted version, or if
+ history data has been deleted (an error),
+ or if the purge COULD have removed the version
+ though it has not yet done so */
+ mem_heap_t* v_heap, /*!< in: memory heap used to create vrow
+ dtuple if it is not yet created. This heap
+ differs from "heap" above in that it could be
+ prebuilt->old_vers_heap for selection */
+ dtuple_t** vrow, /*!< out: virtual column info, if any */
+ ulint v_status)
+ /*!< in: status that determines whether this
+ function is being called by the purge thread,
+ and whether we read the "after image" of the
+ undo log */
+{
+ trx_undo_rec_t* undo_rec = NULL;
+ dtuple_t* entry;
+ trx_id_t rec_trx_id;
+ ulint type;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ upd_t* update;
+ byte* ptr;
+ byte info_bits;
+ ulint cmpl_info;
+ bool dummy_extern;
+ byte* buf;
+
+ ut_ad(!index->table->is_temporary());
+ ut_ad(!rw_lock_own(&purge_sys.latch, RW_LOCK_S));
+ ut_ad(index_mtr->memo_contains_page_flagged(index_rec,
+ MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_X_FIX));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_a(index->is_primary());
+
+ roll_ptr = row_get_rec_roll_ptr(rec, index, offsets);
+
+ *old_vers = NULL;
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+ /* The record rec is the first inserted version */
+ return(true);
+ }
+
+ rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ ut_ad(!index->table->skip_alter_undo);
+
+ if (trx_undo_get_undo_rec(
+ roll_ptr, heap, rec_trx_id, index->table->name,
+ &undo_rec)) {
+ if (v_status & TRX_UNDO_PREV_IN_PURGE) {
+ /* We are fetching the record being purged */
+ undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+ } else {
+ /* The undo record may already have been purged,
+ during purge or semi-consistent read. */
+ return(false);
+ }
+ }
+
+ ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
+
+ if (table_id != index->table->id) {
+ /* The table should have been rebuilt, but purge has
+ not yet removed the undo log records for the
+ now-dropped old table (table_id). */
+ return(true);
+ }
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+
+ /* (a) If a clustered index record version is such that the
+ trx id stamp in it is bigger than purge_sys.view, then the
+ BLOBs in that version are known to exist (the purge has not
+ progressed that far);
+
+ (b) if the version is the first version such that trx id in it
+ is less than purge_sys.view, and it is not delete-marked,
+ then the BLOBs in that version are known to exist (the purge
+ cannot have purged the BLOBs referenced by that version
+ yet).
+
+ This function does not fetch any BLOBs. The callers might, by
+ possibly invoking row_ext_create() via row_build(). However,
+ they should have all needed information in the *old_vers
+ returned by this function. This is because *old_vers is based
+ on the transaction undo log records. The function
+ trx_undo_page_fetch_ext() will write BLOB prefixes to the
+ transaction undo log that are at least as long as the longest
+ possible column prefix in a secondary index. Thus, secondary
+ index entries for *old_vers can be constructed without
+ dereferencing any BLOB pointers. */
+
+ ptr = trx_undo_rec_skip_row_ref(ptr, index);
+
+ ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id,
+ roll_ptr, info_bits,
+ heap, &update);
+ ut_a(ptr);
+
+ if (row_upd_changes_field_size_or_external(index, offsets, update)) {
+ /* We should confirm the existence of disowned external data,
+ if the previous version record is delete marked. If the trx_id
+ of the previous record is seen by purge view, we should treat
+ it as missing history, because the disowned external data
+ might be purged already.
+
+ The inherited external data (BLOBs) can be freed (purged)
+ after trx_id was committed, provided that no view was started
+ before trx_id. If the purge view can see the committed
+ delete-marked record by trx_id, no transactions need to access
+ the BLOB. */
+
+ /* the row_upd_changes_disowned_external(update) call could be
+ omitted, but the synchronization on purge_sys.latch is likely
+ more expensive. */
+
+ if ((update->info_bits & REC_INFO_DELETED_FLAG)
+ && row_upd_changes_disowned_external(update)) {
+ bool missing_extern;
+
+ rw_lock_s_lock(&purge_sys.latch);
+
+ missing_extern = purge_sys.changes_visible(
+ trx_id, index->table->name);
+
+ rw_lock_s_unlock(&purge_sys.latch);
+
+ if (missing_extern) {
+ /* treat as a fresh insert, not to
+ cause assertion error at the caller. */
+ return(true);
+ }
+ }
+
+ /* We have to set the appropriate extern storage bits in the
+ old version of the record: the extern bits in rec for those
+ fields that update does NOT update, as well as the bits for
+ those fields that update updates to become externally stored
+ fields. Store the info: */
+
+ entry = row_rec_to_index_entry(rec, index, offsets, heap);
+ /* The page containing the clustered index record
+ corresponding to entry is latched in mtr. Thus the
+ following call is safe. */
+ if (!row_upd_index_replace_new_col_vals(entry, *index, update,
+ heap)) {
+ ut_a(v_status & TRX_UNDO_PREV_IN_PURGE);
+ return false;
+ }
+
+ /* Get number of externally stored columns in updated record */
+ const ulint n_ext = index->is_primary()
+ ? dtuple_get_n_ext(entry) : 0;
+
+ buf = static_cast<byte*>(mem_heap_alloc(
+ heap, rec_get_converted_size(index, entry, n_ext)));
+
+ *old_vers = rec_convert_dtuple_to_rec(buf, index,
+ entry, n_ext);
+ } else {
+ buf = static_cast<byte*>(mem_heap_alloc(
+ heap, rec_offs_size(offsets)));
+
+ *old_vers = rec_copy(buf, rec, offsets);
+ rec_offs_make_valid(*old_vers, index, true, offsets);
+ rec_set_bit_field_1(*old_vers, update->info_bits,
+ rec_offs_comp(offsets)
+ ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+ for (ulint i = 0; i < update->n_fields; i++) {
+ const upd_field_t* uf = upd_get_nth_field(update, i);
+ if (upd_fld_is_virtual_col(uf)) {
+ /* There are no virtual columns in
+ a clustered index record. */
+ continue;
+ }
+ const ulint n = uf->field_no;
+ ut_ad(!dfield_is_ext(&uf->new_val)
+ == !rec_offs_nth_extern(offsets, n));
+ ut_ad(!rec_offs_nth_default(offsets, n));
+
+ if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) {
+ if (rec_offs_nth_sql_null(offsets, n)) {
+ ut_ad(index->table->is_instant());
+ ut_ad(n >= index->n_core_fields);
+ continue;
+ }
+ ut_ad(!index->table->not_redundant());
+ ulint l = rec_get_1byte_offs_flag(*old_vers)
+ ? (n + 1) : (n + 1) * 2;
+ byte* b = *old_vers - REC_N_OLD_EXTRA_BYTES
+ - l;
+ *b= byte(*b | REC_1BYTE_SQL_NULL_MASK);
+ compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
+ == REC_2BYTE_SQL_NULL_MASK);
+ continue;
+ }
+
+ ulint len;
+ memcpy(rec_get_nth_field(*old_vers, offsets, n, &len),
+ uf->new_val.data, uf->new_val.len);
+ if (UNIV_UNLIKELY(len != uf->new_val.len)) {
+ ut_ad(len == UNIV_SQL_NULL);
+ ut_ad(!rec_offs_comp(offsets));
+ ut_ad(uf->new_val.len
+ == rec_get_nth_field_size(rec, n));
+ ulint l = rec_get_1byte_offs_flag(*old_vers)
+ ? (n + 1) : (n + 1) * 2;
+ *(*old_vers - REC_N_OLD_EXTRA_BYTES - l)
+ &= byte(~REC_1BYTE_SQL_NULL_MASK);
+ }
+ }
+ }
+
+ /* Copy the old values (which are the after-image of the update)
+ from the update vector into the dtuple vrow */
+ if (v_status & TRX_UNDO_GET_OLD_V_VALUE) {
+ row_upd_replace_vcol((dtuple_t*)*vrow, index->table, update,
+ false, NULL, NULL);
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ rec_offs offsets_dbg[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_dbg);
+ ut_a(!rec_offs_any_null_extern(
+ *old_vers, rec_get_offsets(*old_vers, index, offsets_dbg,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap)));
+#endif // defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+
+ if (vrow && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ if (!(*vrow)) {
+ *vrow = dtuple_create_with_vcol(
+ v_heap ? v_heap : heap,
+ dict_table_get_n_cols(index->table),
+ dict_table_get_n_v_cols(index->table));
+ dtuple_init_v_fld(*vrow);
+ }
+
+ ut_ad(index->table->n_v_cols);
+ trx_undo_read_v_cols(index->table, ptr, *vrow,
+ v_status & TRX_UNDO_PREV_IN_PURGE);
+ }
+
+ return(true);
+}
+
+/** Read virtual column value from undo log
+@param[in] table the table
+@param[in] ptr undo log pointer
+@param[in,out] row the dtuple to fill
+@param[in] in_purge whether this is called by purge */
+void
+trx_undo_read_v_cols(
+ const dict_table_t* table,
+ const byte* ptr,
+ dtuple_t* row,
+ bool in_purge)
+{
+ const byte* end_ptr;
+ bool first_v_col = true;
+ bool is_undo_log = true;
+
+ end_ptr = ptr + mach_read_from_2(ptr);
+ ptr += 2;
+ while (ptr < end_ptr) {
+ dfield_t* dfield;
+ const byte* field;
+ uint32_t field_no, len, orig_len;
+
+ field_no = mach_read_next_compressed(
+ const_cast<const byte**>(&ptr));
+
+ const bool is_virtual = (field_no >= REC_MAX_N_FIELDS);
+
+ if (is_virtual) {
+ ptr = trx_undo_read_v_idx(
+ table, ptr, first_v_col, &is_undo_log,
+ &field_no);
+ first_v_col = false;
+ }
+
+ ptr = trx_undo_rec_get_col_val(
+ ptr, &field, &len, &orig_len);
+
+ /* The virtual column is no longer indexed or does not exist.
+ This check needs to be placed after trx_undo_rec_get_col_val()
+ so that the undo ptr advances */
+ if (field_no == FIL_NULL) {
+ ut_ad(is_virtual);
+ continue;
+ }
+
+ if (is_virtual) {
+ dict_v_col_t* vcol = dict_table_get_nth_v_col(
+ table, field_no);
+
+ dfield = dtuple_get_nth_v_field(row, vcol->v_pos);
+
+ if (!in_purge
+ || dfield_get_type(dfield)->mtype == DATA_MISSING) {
+ dict_col_copy_type(
+ &vcol->m_col,
+ dfield_get_type(dfield));
+ dfield_set_data(dfield, field, len);
+ }
+ }
+ }
+
+ ut_ad(ptr == end_ptr);
+}
diff --git a/storage/innobase/trx/trx0roll.cc b/storage/innobase/trx/trx0roll.cc
new file mode 100644
index 00000000..23aa950a
--- /dev/null
+++ b/storage/innobase/trx/trx0roll.cc
@@ -0,0 +1,984 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0roll.cc
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0roll.h"
+
+#include <my_service_manager.h>
+#include <mysql/service_wsrep.h>
+
+#include "fsp0fsp.h"
+#include "lock0lock.h"
+#include "mach0data.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "row0mysql.h"
+#include "row0undo.h"
+#include "srv0mon.h"
+#include "srv0start.h"
+#include "trx0rec.h"
+#include "trx0rseg.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "trx0undo.h"
+
+#ifdef UNIV_PFS_THREAD
+mysql_pfs_key_t trx_rollback_clean_thread_key;
+#endif
+
+/** true if trx_rollback_all_recovered() thread is active */
+bool trx_rollback_is_active;
+
+/** In crash recovery, the current trx to be rolled back; NULL otherwise */
+const trx_t* trx_roll_crash_recv_trx;
+
+/** Finish transaction rollback.
+@return whether the rollback was completed normally
+@retval false if the rollback was aborted by shutdown */
+inline bool trx_t::rollback_finish()
+{
+ mod_tables.clear();
+ if (UNIV_LIKELY(error_state == DB_SUCCESS))
+ {
+ commit();
+ return true;
+ }
+
+ ut_a(error_state == DB_INTERRUPTED);
+ ut_ad(srv_shutdown_state != SRV_SHUTDOWN_NONE);
+ ut_a(!srv_undo_sources);
+ ut_ad(srv_fast_shutdown);
+ ut_d(in_rollback= false);
+ if (trx_undo_t *&undo= rsegs.m_redo.undo)
+ {
+ UT_LIST_REMOVE(rsegs.m_redo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo= nullptr;
+ }
+ if (trx_undo_t *&undo= rsegs.m_noredo.undo)
+ {
+ UT_LIST_REMOVE(rsegs.m_noredo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo= nullptr;
+ }
+ commit_low();
+ lock.que_state= TRX_QUE_RUNNING;
+ return false;
+}
+
+/** Roll back an active transaction. */
+inline void trx_t::rollback_low(trx_savept_t *savept)
+{
+ mem_heap_t *heap= mem_heap_create(512);
+ roll_node_t *roll_node= roll_node_create(heap);
+ roll_node->savept= savept;
+
+ ut_ad(!in_rollback);
+#ifdef UNIV_DEBUG
+ {
+ const auto s= state;
+ ut_ad(s == TRX_STATE_ACTIVE ||
+ s == TRX_STATE_PREPARED ||
+ s == TRX_STATE_PREPARED_RECOVERED);
+ if (savept)
+ {
+ ut_ad(s == TRX_STATE_ACTIVE);
+ ut_ad(mysql_thd);
+ ut_ad(!is_recovered);
+ }
+ }
+#endif
+
+ error_state = DB_SUCCESS;
+
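+ /* If the transaction has written any undo log records, build a
+ rollback query graph and run it to undo the logged changes. */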
+ if (has_logged())
+ {
+ ut_ad(rsegs.m_redo.rseg || rsegs.m_noredo.rseg);
+ que_thr_t *thr= pars_complete_graph_for_exec(roll_node, this, heap,
+ nullptr);
+ ut_a(thr == que_fork_start_command(static_cast<que_fork_t*>
+ (que_node_get_parent(thr))));
+ que_run_threads(thr);
+ que_run_threads(roll_node->undo_thr);
+
+ /* Free the memory reserved by the undo graph. */
+ que_graph_free(static_cast<que_t*>(roll_node->undo_thr->common.parent));
+ }
+
+ if (!savept)
+ {
+ rollback_finish();
+ MONITOR_INC(MONITOR_TRX_ROLLBACK);
+ }
+ else
+ {
+ ut_a(error_state == DB_SUCCESS);
+ const undo_no_t limit= savept->least_undo_no;
+ for (trx_mod_tables_t::iterator i= mod_tables.begin();
+ i != mod_tables.end(); )
+ {
+ trx_mod_tables_t::iterator j= i++;
+ ut_ad(j->second.valid());
+ if (j->second.rollback(limit))
+ mod_tables.erase(j);
+ }
+ lock.que_state= TRX_QUE_RUNNING;
+ MONITOR_INC(MONITOR_TRX_ROLLBACK_SAVEPOINT);
+ }
+
+ mem_heap_free(heap);
+
+ MONITOR_DEC(MONITOR_TRX_ACTIVE);
+}
+
+/** Initiate rollback.
+@param savept savepoint
+@return error code or DB_SUCCESS */
+dberr_t trx_t::rollback(trx_savept_t *savept)
+{
+ ut_ad(!trx_mutex_own(this));
+ if (state == TRX_STATE_NOT_STARTED)
+ {
+ error_state= DB_SUCCESS;
+ return DB_SUCCESS;
+ }
+ ut_ad(state == TRX_STATE_ACTIVE);
+#ifdef WITH_WSREP
+ if (!savept && is_wsrep() && wsrep_thd_is_SR(mysql_thd))
+ wsrep_handle_SR_rollback(nullptr, mysql_thd);
+#endif /* WITH_WSREP */
+ rollback_low(savept);
+ return error_state;
+}
+
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+static
+dberr_t
+trx_rollback_for_mysql_low(
+/*=======================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ trx->op_info = "rollback";
+
+ /* If we are doing the XA recovery of prepared transactions,
+ then the transaction object does not have an InnoDB session
+ object, and we set a dummy session that we use for all MySQL
+ transactions. */
+
+ trx->rollback_low();
+
+ trx->op_info = "";
+
+ return(trx->error_state);
+}
+
+/** Rollback a transaction used in MySQL
+@param[in, out] trx transaction
+@return error code or DB_SUCCESS */
+dberr_t trx_rollback_for_mysql(trx_t* trx)
+{
+ /* We are reading trx->state without holding trx->mutex
+ here, because the rollback should be invoked for a running
+ active MySQL transaction (or recovered prepared transaction)
+ that is associated with the current thread. */
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx->will_lock = false;
+ ut_ad(trx->mysql_thd);
+#ifdef WITH_WSREP
+ trx->wsrep= false;
+ trx->lock.was_chosen_as_wsrep_victim= false;
+#endif
+ return(DB_SUCCESS);
+
+ case TRX_STATE_ACTIVE:
+ ut_ad(trx->mysql_thd);
+ ut_ad(!trx->is_recovered);
+ ut_ad(!trx->is_autocommit_non_locking() || trx->read_only);
+ return(trx_rollback_for_mysql_low(trx));
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ ut_ad(!trx->is_autocommit_non_locking());
+ if (trx->rsegs.m_redo.undo) {
+ /* The XA ROLLBACK of a XA PREPARE transaction
+ will consist of multiple mini-transactions.
+
+ As the very first step of XA ROLLBACK, we must
+ change the undo log state back from
+ TRX_UNDO_PREPARED to TRX_UNDO_ACTIVE, in order
+ to ensure that recovery will complete the
+ rollback.
+
+ Failure to perform this step could cause a
+ situation where we would roll back part of
+ a XA PREPARE transaction, the server would be
+ killed, and finally, the transaction would be
+ recovered in XA PREPARE state, with some of
+ the actions already having been rolled back. */
+ ut_ad(trx->rsegs.m_redo.undo->rseg
+ == trx->rsegs.m_redo.rseg);
+ mtr_t mtr;
+ mtr.start();
+ mutex_enter(&trx->rsegs.m_redo.rseg->mutex);
+ if (trx_undo_t* undo = trx->rsegs.m_redo.undo) {
+ trx_undo_set_state_at_prepare(trx, undo, true,
+ &mtr);
+ }
+ mutex_exit(&trx->rsegs.m_redo.rseg->mutex);
+ /* Write the redo log for the XA ROLLBACK
+ state change to the global buffer. It is
+ not necessary to flush the redo log. If
+ a durable log write of a later mini-transaction
+ takes place for whatever reason, then this state
+ change will be durable as well. */
+ mtr.commit();
+ ut_ad(mtr.commit_lsn() > 0);
+ }
+ return(trx_rollback_for_mysql_low(trx));
+
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ ut_ad(!trx->is_autocommit_non_locking());
+ break;
+ }
+
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/*******************************************************************//**
+Rollback the latest SQL statement for MySQL.
+@return error code or DB_SUCCESS */
+dberr_t
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ dberr_t err;
+
+ /* We are reading trx->state without holding trx->mutex
+ here, because the statement rollback should be invoked for a
+ running active MySQL transaction that is associated with the
+ current thread. */
+ ut_ad(trx->mysql_thd);
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ return(DB_SUCCESS);
+
+ case TRX_STATE_ACTIVE:
+ ut_ad(trx->mysql_thd);
+ ut_ad(!trx->is_recovered);
+ ut_ad(!trx->is_autocommit_non_locking() || trx->read_only);
+
+ trx->op_info = "rollback of SQL statement";
+
+ err = trx->rollback(&trx->last_sql_stat_start);
+
+ if (trx->fts_trx != NULL) {
+ fts_savepoint_rollback_last_stmt(trx);
+ }
+
+ /* The following call should not be needed,
+ but we play it safe: */
+ trx_mark_sql_stat_end(trx);
+
+ trx->op_info = "";
+
+ return(err);
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ /* The statement rollback is only allowed on an ACTIVE
+ transaction, not a PREPARED or COMMITTED one. */
+ break;
+ }
+
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/*******************************************************************//**
+Search for a savepoint using name.
+@return savepoint if found else NULL */
+static
+trx_named_savept_t*
+trx_savepoint_find(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ const char* name) /*!< in: savepoint name */
+{
+ trx_named_savept_t* savep;
+
+ for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+ savep != NULL;
+ savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) {
+ if (!strcmp(savep->name, name)) {
+ return(savep);
+ }
+ }
+
+ return(NULL);
+}
+
+/*******************************************************************//**
+Frees a single savepoint struct. */
+static
+void
+trx_roll_savepoint_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep) /*!< in: savepoint to free */
+{
+ UT_LIST_REMOVE(trx->trx_savepoints, savep);
+
+ ut_free(savep->name);
+ ut_free(savep);
+}
+
+/*******************************************************************//**
+Frees savepoint structs starting from savep. */
+void
+trx_roll_savepoints_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep) /*!< in: free all savepoints starting
+ with this savepoint */
+{
+ while (savep != NULL) {
+ trx_named_savept_t* next_savep;
+
+ next_savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+
+ trx_roll_savepoint_free(trx, savep);
+
+ savep = next_savep;
+ }
+}
+
+/*******************************************************************//**
+Rolls back a transaction back to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+trx_rollback_to_savepoint_for_mysql_low(
+/*====================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ trx_named_savept_t* savep, /*!< in/out: savepoint */
+ int64_t* mysql_binlog_cache_pos)
+ /*!< out: the MySQL binlog
+ cache position corresponding
+ to this savepoint; MySQL needs
+ this information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+{
+ dberr_t err;
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_ad(trx->mysql_thd);
+
+ /* Free all savepoints strictly later than savep. */
+
+ trx_roll_savepoints_free(
+ trx, UT_LIST_GET_NEXT(trx_savepoints, savep));
+
+ *mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos;
+
+ trx->op_info = "rollback to a savepoint";
+
+ err = trx->rollback(&savep->savept);
+
+ /* Store the current undo_no of the transaction so that
+ we know where to roll back if we have to roll back the
+ next SQL statement: */
+
+ trx_mark_sql_stat_end(trx);
+
+ trx->op_info = "";
+#ifdef WITH_WSREP
+ trx->lock.was_chosen_as_wsrep_victim = false;
+#endif
+ return(err);
+}
+
+/*******************************************************************//**
+Rolls back a transaction back to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+dberr_t
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ int64_t* mysql_binlog_cache_pos) /*!< out: the MySQL binlog cache
+ position corresponding to this
+ savepoint; MySQL needs this
+ information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+{
+ trx_named_savept_t* savep;
+
+ /* We are reading trx->state without holding trx->mutex
+ here, because the savepoint rollback should be invoked for a
+ running active MySQL transaction that is associated with the
+ current thread. */
+ ut_ad(trx->mysql_thd);
+
+ savep = trx_savepoint_find(trx, savepoint_name);
+
+ if (savep == NULL) {
+ return(DB_NO_SAVEPOINT);
+ }
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ ib::error() << "Transaction has a savepoint "
+ << savep->name
+ << " though it is not started";
+ return(DB_ERROR);
+
+ case TRX_STATE_ACTIVE:
+
+ return(trx_rollback_to_savepoint_for_mysql_low(
+ trx, savep, mysql_binlog_cache_pos));
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ /* The savepoint rollback is only allowed on an ACTIVE
+ transaction, not a PREPARED or COMMITTED one. */
+ break;
+ }
+
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/*******************************************************************//**
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new. Savepoints are deleted in a transaction
+commit or rollback.
+@return always DB_SUCCESS */
+dberr_t
+trx_savepoint_for_mysql(
+/*====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ int64_t binlog_cache_pos) /*!< in: MySQL binlog cache
+ position corresponding to this
+ connection at the time of the
+ savepoint */
+{
+ trx_named_savept_t* savep;
+
+ trx_start_if_not_started_xa(trx, false);
+
+ savep = trx_savepoint_find(trx, savepoint_name);
+
+ if (savep) {
+ /* There is a savepoint with the same name: free that */
+
+ UT_LIST_REMOVE(trx->trx_savepoints, savep);
+
+ ut_free(savep->name);
+ ut_free(savep);
+ }
+
+ /* Create a new savepoint and add it as the last in the list */
+
+ savep = static_cast<trx_named_savept_t*>(
+ ut_malloc_nokey(sizeof(*savep)));
+
+ savep->name = mem_strdup(savepoint_name);
+
+ savep->savept = trx_savept_take(trx);
+
+ savep->mysql_binlog_cache_pos = binlog_cache_pos;
+
+ UT_LIST_ADD_LAST(trx->trx_savepoints, savep);
+
+ return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Releases only the named savepoint. Savepoints which were set after this
+savepoint are left as is.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+dberr_t
+trx_release_savepoint_for_mysql(
+/*============================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name) /*!< in: savepoint name */
+{
+ trx_named_savept_t* savep;
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE, true)
+ || trx_state_eq(trx, TRX_STATE_PREPARED, true));
+ ut_ad(trx->mysql_thd);
+
+ savep = trx_savepoint_find(trx, savepoint_name);
+
+ if (savep != NULL) {
+ trx_roll_savepoint_free(trx, savep);
+ }
+
+ return(savep != NULL ? DB_SUCCESS : DB_NO_SAVEPOINT);
+}
+
+/*******************************************************************//**
+Returns a transaction savepoint taken at this point in time.
+@return savepoint */
+trx_savept_t
+trx_savept_take(
+/*============*/
+ trx_t* trx) /*!< in: transaction */
+{
+ trx_savept_t savept;
+
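+ /* A savepoint records the transaction's current undo number;
+ rolling back to it undoes the undo log records written from
+ this point on. */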
+ savept.least_undo_no = trx->undo_no;
+
+ return(savept);
+}
+
+/*******************************************************************//**
+Roll back an active transaction. */
+static
+void
+trx_rollback_active(
+/*================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+ roll_node_t* roll_node;
+ const trx_id_t trx_id = trx->id;
+
+ ut_ad(trx_id);
+
+ heap = mem_heap_create(512);
+
+ fork = que_fork_create(NULL, NULL, QUE_FORK_RECOVERY, heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap, NULL);
+
+ roll_node = roll_node_create(heap);
+
+ thr->child = roll_node;
+ roll_node->common.parent = thr;
+
+ trx->graph = fork;
+
+ ut_a(thr == que_fork_start_command(fork));
+
+ trx_roll_crash_recv_trx = trx;
+
+ const bool dictionary_locked = trx_get_dict_operation(trx)
+ != TRX_DICT_OP_NONE;
+
+ if (dictionary_locked) {
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ que_run_threads(thr);
+ ut_a(roll_node->undo_thr != NULL);
+
+ que_run_threads(roll_node->undo_thr);
+
+ que_graph_free(
+ static_cast<que_t*>(roll_node->undo_thr->common.parent));
+
+ if (UNIV_UNLIKELY(!trx->rollback_finish())) {
+ ut_ad(!dictionary_locked);
+ goto func_exit;
+ }
+
+ ut_a(trx->lock.que_state == TRX_QUE_RUNNING);
+
+ if (!dictionary_locked || !trx->table_id) {
+ } else if (dict_table_t* table = dict_table_open_on_id(
+ trx->table_id, TRUE, DICT_TABLE_OP_NORMAL)) {
+ ib::info() << "Dropping table " << table->name
+ << ", with id " << trx->table_id
+ << " in recovery";
+
+ dict_table_close_and_drop(trx, table);
+
+ trx_commit_for_mysql(trx);
+ }
+
+ ib::info() << "Rolled back recovered transaction " << trx_id;
+
+func_exit:
+ if (dictionary_locked) {
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ mem_heap_free(heap);
+
+ trx_roll_crash_recv_trx = NULL;
+}
+
+
+struct trx_roll_count_callback_arg
+{
+ uint32_t n_trx;
+ uint64_t n_rows;
+ trx_roll_count_callback_arg(): n_trx(0), n_rows(0) {}
+};
+
+
+static my_bool trx_roll_count_callback(rw_trx_hash_element_t *element,
+ trx_roll_count_callback_arg *arg)
+{
+ mutex_enter(&element->mutex);
+ if (trx_t *trx= element->trx)
+ {
+ if (trx->is_recovered && trx_state_eq(trx, TRX_STATE_ACTIVE))
+ {
+ arg->n_trx++;
+ arg->n_rows+= trx->undo_no;
+ }
+ }
+ mutex_exit(&element->mutex);
+ return 0;
+}
+
+/** Report progress when rolling back a row of a recovered transaction. */
+void trx_roll_report_progress()
+{
+ time_t now = time(NULL);
+ mutex_enter(&recv_sys.mutex);
+ bool report = recv_sys.report(now);
+ mutex_exit(&recv_sys.mutex);
+
+ if (report) {
+ trx_roll_count_callback_arg arg;
+
+ /* Get number of recovered active transactions and number of
+ rows they modified. Numbers must be accurate, because only this
+ thread is allowed to touch recovered transactions. */
+ trx_sys.rw_trx_hash.iterate_no_dups(
+ trx_roll_count_callback, &arg);
+
+ if (arg.n_rows > 0) {
+ service_manager_extend_timeout(
+ INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "To roll back: " UINT32PF " transactions, "
+ UINT64PF " rows", arg.n_trx, arg.n_rows);
+ }
+
+ ib::info() << "To roll back: " << arg.n_trx
+ << " transactions, " << arg.n_rows << " rows";
+
+ }
+}
+
+
+static my_bool trx_rollback_recovered_callback(rw_trx_hash_element_t *element,
+ std::vector<trx_t*> *trx_list)
+{
+ mutex_enter(&element->mutex);
+ if (trx_t *trx= element->trx)
+ {
+ mutex_enter(&trx->mutex);
+ if (trx_state_eq(trx, TRX_STATE_ACTIVE) && trx->is_recovered)
+ trx_list->push_back(trx);
+ mutex_exit(&trx->mutex);
+ }
+ mutex_exit(&element->mutex);
+ return 0;
+}
+
+
+/**
+ Rollback any incomplete transactions which were encountered in crash recovery.
+
+ If the transaction already was committed, then we clean up a possible insert
+ undo log. If the transaction was not yet committed, then we roll it back.
+
+ Note: For XA recovered transactions, we rely on MySQL to
+ do the rollback. They will be in TRX_STATE_PREPARED state. If the server
+ is shut down while they are still lingering in trx_sys_t::trx_list,
+ the shutdown will hang.
+
+ @param[in] all true=roll back all recovered active transactions;
+ false=roll back any incomplete dictionary transaction
+*/
+
+void trx_rollback_recovered(bool all)
+{
+ std::vector<trx_t*> trx_list;
+
+ ut_a(srv_force_recovery < SRV_FORCE_NO_TRX_UNDO);
+
+ /*
+ Collect list of recovered ACTIVE transaction ids first. Once collected, no
+ other thread is allowed to modify or remove these transactions from
+ rw_trx_hash.
+ */
+ trx_sys.rw_trx_hash.iterate_no_dups(trx_rollback_recovered_callback,
+ &trx_list);
+
+ while (!trx_list.empty())
+ {
+ trx_t *trx= trx_list.back();
+ trx_list.pop_back();
+
+ ut_ad(trx);
+ ut_d(trx_mutex_enter(trx));
+ ut_ad(trx->is_recovered);
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_d(trx_mutex_exit(trx));
+
+ if (srv_shutdown_state != SRV_SHUTDOWN_NONE && !srv_undo_sources &&
+ srv_fast_shutdown)
+ goto discard;
+
+ if (all || trx_get_dict_operation(trx) != TRX_DICT_OP_NONE
+ || trx->has_stats_table_lock())
+ {
+ trx_rollback_active(trx);
+ if (trx->error_state != DB_SUCCESS)
+ {
+ ut_ad(trx->error_state == DB_INTERRUPTED);
+ trx->error_state= DB_SUCCESS;
+ ut_ad(!srv_undo_sources);
+ ut_ad(srv_fast_shutdown);
+discard:
+ /* Note: before kill_server() invoked innobase_end() via
+ unireg_end(), it invoked close_connections(), which should initiate
+ the rollback of any user transactions via THD::cleanup() in the
+ connection threads, and wait for all THD::cleanup() to complete.
+ So, no active user transactions should exist at this point.
+
+ srv_undo_sources=false was cleared early in innobase_end().
+
+ Generally, the server guarantees that all connections using
+ InnoDB must be disconnected by the time we are reaching this code,
+ be it during shutdown or UNINSTALL PLUGIN.
+
+ Because there is no possible race condition with any
+ concurrent user transaction, we do not have to invoke
+ trx->commit_state() or wait for !trx->is_referenced()
+ before trx_sys.deregister_rw(trx). */
+ trx_sys.deregister_rw(trx);
+ trx_free_at_shutdown(trx);
+ }
+ else
+ trx->free();
+ }
+ }
+}
+
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+Note: this is done in a background thread.
+@return a dummy parameter */
+extern "C"
+os_thread_ret_t
+DECLARE_THREAD(trx_rollback_all_recovered)(void*)
+{
+ my_thread_init();
+ ut_ad(!srv_read_only_mode);
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(trx_rollback_clean_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+ if (trx_sys.rw_trx_hash.size()) {
+ ib::info() << "Starting in background the rollback of"
+ " recovered transactions";
+ trx_rollback_recovered(true);
+ ib::info() << "Rollback of non-prepared transactions"
+ " completed";
+ }
+
+ trx_rollback_is_active = false;
+
+ my_thread_end();
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit();
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/****************************************************************//**
+Builds an undo 'query' graph for a transaction. The actual rollback is
+performed by executing this query graph like a query subprocedure call.
+The reply about the completion of the rollback will be sent by this
+graph.
+@return own: the query graph */
+static
+que_t*
+trx_roll_graph_build(
+/*=================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+
+ ut_ad(trx_mutex_own(trx));
+
+ heap = mem_heap_create(512);
+ fork = que_fork_create(NULL, NULL, QUE_FORK_ROLLBACK, heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap, NULL);
+
+ thr->child = row_undo_node_create(trx, thr, heap);
+
+ return(fork);
+}
+
+/*********************************************************************//**
+Starts a rollback operation, creates the UNDO graph that will do the
+actual undo operation.
+@return query graph thread that will perform the UNDO operations. */
+static
+que_thr_t*
+trx_rollback_start(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ undo_no_t roll_limit) /*!< in: rollback to undo no (for
+ partial undo), 0 if we are rolling back
+ the entire transaction */
+{
+ ut_ad(trx_mutex_own(trx));
+
+ /* Initialize the rollback field in the transaction */
+
+ ut_ad(!trx->roll_limit);
+ ut_ad(!trx->in_rollback);
+
+ trx->roll_limit = roll_limit;
+ trx->in_rollback = true;
+
+ ut_a(trx->roll_limit <= trx->undo_no);
+
+ trx->pages_undone = 0;
+
+ /* Build a 'query' graph which will perform the undo operations */
+
+ que_t* roll_graph = trx_roll_graph_build(trx);
+
+ trx->graph = roll_graph;
+
+ trx->lock.que_state = TRX_QUE_ROLLING_BACK;
+
+ return(que_fork_start_command(roll_graph));
+}
+
+/*********************************************************************//**
+Creates a rollback command node struct.
+@return own: rollback node struct */
+roll_node_t*
+roll_node_create(
+/*=============*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ roll_node_t* node;
+
+ node = static_cast<roll_node_t*>(mem_heap_zalloc(heap, sizeof(*node)));
+
+ node->state = ROLL_NODE_SEND;
+
+ node->common.type = QUE_NODE_ROLLBACK;
+
+ return(node);
+}
+
+/***********************************************************//**
+Performs an execution step for a rollback command node in a query graph.
+@return query thread to run next, or NULL */
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ roll_node_t* node;
+
+ node = static_cast<roll_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = ROLL_NODE_SEND;
+ }
+
+ if (node->state == ROLL_NODE_SEND) {
+ trx_t* trx;
+ ib_id_t roll_limit;
+
+ trx = thr_get_trx(thr);
+
+ trx_mutex_enter(trx);
+
+ node->state = ROLL_NODE_WAIT;
+
+ ut_a(node->undo_thr == NULL);
+
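+		/* When rolling back to a savepoint, only undo records
+		starting from the savepoint's least_undo_no; 0 means a
+		complete rollback. */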
+ roll_limit = node->savept ? node->savept->least_undo_no : 0;
+
+ trx_commit_or_rollback_prepare(trx);
+
+ node->undo_thr = trx_rollback_start(trx, roll_limit);
+
+ trx_mutex_exit(trx);
+
+ } else {
+ ut_ad(node->state == ROLL_NODE_WAIT);
+
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc
new file mode 100644
index 00000000..307f8757
--- /dev/null
+++ b/storage/innobase/trx/trx0rseg.cc
@@ -0,0 +1,768 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rseg.cc
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "fut0lst.h"
+#include "srv0srv.h"
+#include "trx0purge.h"
+#include "srv0mon.h"
+
+#ifdef WITH_WSREP
+#include <mysql/service_wsrep.h>
+
+#ifdef UNIV_DEBUG
+/** The latest known WSREP XID sequence number */
+static long long wsrep_seqno = -1;
+#endif /* UNIV_DEBUG */
+/** The latest known WSREP XID UUID */
+static unsigned char wsrep_uuid[16];
+
+/** Write the WSREP XID information into rollback segment header.
+@param[in,out] rseg_header rollback segment header
+@param[in] xid WSREP XID
+@param[in,out] mtr mini transaction */
+static void
+trx_rseg_write_wsrep_checkpoint(
+ buf_block_t* rseg_header,
+ const XID* xid,
+ mtr_t* mtr)
+{
+ DBUG_ASSERT(xid->gtrid_length >= 0);
+ DBUG_ASSERT(xid->bqual_length >= 0);
+ DBUG_ASSERT(xid->gtrid_length + xid->bqual_length < XIDDATASIZE);
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_FORMAT
+ + rseg_header->frame,
+ uint32_t(xid->formatID));
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_GTRID_LEN
+ + rseg_header->frame,
+ uint32_t(xid->gtrid_length));
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_BQUAL_LEN
+ + rseg_header->frame,
+ uint32_t(xid->bqual_length));
+
+ const ulint xid_length = static_cast<ulint>(xid->gtrid_length
+ + xid->bqual_length);
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+ + rseg_header->frame,
+ xid->data, xid_length);
+ if (xid_length < XIDDATASIZE
+ && memcmp(TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+ + rseg_header->frame, field_ref_zero,
+ XIDDATASIZE - xid_length)) {
+ mtr->memset(rseg_header,
+ TRX_RSEG + TRX_RSEG_WSREP_XID_DATA + xid_length,
+ XIDDATASIZE - xid_length, 0);
+ }
+}
+
+/** Update the WSREP XID information in rollback segment header.
+@param[in,out] rseg_header rollback segment header
+@param[in] xid WSREP XID
+@param[in,out] mtr mini-transaction */
+void
+trx_rseg_update_wsrep_checkpoint(
+ buf_block_t* rseg_header,
+ const XID* xid,
+ mtr_t* mtr)
+{
+ ut_ad(wsrep_is_wsrep_xid(xid));
+
+#ifdef UNIV_DEBUG
+ /* Check that seqno is monotonically increasing */
+ long long xid_seqno = wsrep_xid_seqno(xid);
+ const byte* xid_uuid = wsrep_xid_uuid(xid);
+
+ if (xid_seqno != -1
+ && !memcmp(xid_uuid, wsrep_uuid, sizeof wsrep_uuid)) {
+ ut_ad(xid_seqno > wsrep_seqno);
+ } else {
+ memcpy(wsrep_uuid, xid_uuid, sizeof wsrep_uuid);
+ }
+ wsrep_seqno = xid_seqno;
+#endif /* UNIV_DEBUG */
+ trx_rseg_write_wsrep_checkpoint(rseg_header, xid, mtr);
+}
+
+/** Clear the WSREP XID information from rollback segment header.
+@param[in,out] block rollback segment header
+@param[in,out] mtr mini-transaction */
+static void trx_rseg_clear_wsrep_checkpoint(buf_block_t *block, mtr_t *mtr)
+{
+ mtr->memset(block, TRX_RSEG + TRX_RSEG_WSREP_XID_INFO,
+ TRX_RSEG_WSREP_XID_DATA + XIDDATASIZE - TRX_RSEG_WSREP_XID_INFO,
+ 0);
+}
+
+static void
+trx_rseg_update_wsrep_checkpoint(const XID* xid, mtr_t* mtr)
+{
+ const byte* xid_uuid = wsrep_xid_uuid(xid);
+	/* We must check against wsrep_uuid here, because
+	trx_rseg_update_wsrep_checkpoint() overwrites wsrep_uuid with the
+	xid contents in debug mode, and then the memcmp() would never give
+	a nonzero result. */
+ const bool must_clear_rsegs = memcmp(wsrep_uuid, xid_uuid,
+ sizeof wsrep_uuid);
+ const trx_rseg_t* rseg = trx_sys.rseg_array[0];
+
+ buf_block_t* rseg_header = trx_rsegf_get(rseg->space, rseg->page_no,
+ mtr);
+ if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+ + rseg_header->frame))) {
+ trx_rseg_format_upgrade(rseg_header, mtr);
+ }
+
+ trx_rseg_update_wsrep_checkpoint(rseg_header, xid, mtr);
+
+ if (must_clear_rsegs) {
+ /* Because the UUID part of the WSREP XID differed
+ from current_xid_uuid, the WSREP group UUID was
+ changed, and we must reset the XID in all rollback
+ segment headers. */
+ for (ulint rseg_id = 1; rseg_id < TRX_SYS_N_RSEGS; ++rseg_id) {
+ if (const trx_rseg_t* rseg =
+ trx_sys.rseg_array[rseg_id]) {
+ trx_rseg_clear_wsrep_checkpoint(
+ trx_rsegf_get(rseg->space,
+ rseg->page_no, mtr),
+ mtr);
+ }
+ }
+ }
+}
+
+/** Update WSREP checkpoint XID in first rollback segment header
+as part of wsrep_set_SE_checkpoint() when it is guaranteed that there
+are no wsrep transactions committing.
+If the UUID part of the WSREP XID does not match the UUIDs of the XIDs already
+stored in the rollback segments, the WSREP XID in all the remaining rollback
+segments will be reset.
+@param[in] xid WSREP XID */
+void trx_rseg_update_wsrep_checkpoint(const XID* xid)
+{
+ mtr_t mtr;
+ mtr.start();
+ trx_rseg_update_wsrep_checkpoint(xid, &mtr);
+ mtr.commit();
+}
+
+/** Read the WSREP XID information in rollback segment header.
+@param[in] rseg_header Rollback segment header
+@param[out] xid Transaction XID
+@return whether the WSREP XID was present */
+static
+bool trx_rseg_read_wsrep_checkpoint(const buf_block_t *rseg_header, XID &xid)
+{
+ int formatID = static_cast<int>(
+ mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_FORMAT
+ + rseg_header->frame));
+ if (formatID == 0) {
+ return false;
+ }
+
+ xid.formatID = formatID;
+ xid.gtrid_length = static_cast<int>(
+ mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_GTRID_LEN
+ + rseg_header->frame));
+
+ xid.bqual_length = static_cast<int>(
+ mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_BQUAL_LEN
+ + rseg_header->frame));
+
+ memcpy(xid.data, TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+ + rseg_header->frame, XIDDATASIZE);
+
+ return true;
+}
+
+/** Read the WSREP XID from the TRX_SYS page (in case of upgrade).
+@param[in] page TRX_SYS page
+@param[out] xid WSREP XID (if present)
+@return whether the WSREP XID is present */
+static bool trx_rseg_init_wsrep_xid(const page_t* page, XID& xid)
+{
+ if (mach_read_from_4(TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_MAGIC_N_FLD
+ + page)
+ != TRX_SYS_WSREP_XID_MAGIC_N) {
+ return false;
+ }
+
+ xid.formatID = static_cast<int>(
+ mach_read_from_4(
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_FORMAT + page));
+ xid.gtrid_length = static_cast<int>(
+ mach_read_from_4(
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_GTRID_LEN + page));
+ xid.bqual_length = static_cast<int>(
+ mach_read_from_4(
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_BQUAL_LEN + page));
+ memcpy(xid.data,
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ + TRX_SYS_WSREP_XID_DATA + page, XIDDATASIZE);
+ return true;
+}
+
+/** Recover the latest WSREP checkpoint XID.
+@param[out] xid WSREP XID
+@return whether the WSREP XID was found */
+bool trx_rseg_read_wsrep_checkpoint(XID& xid)
+{
+ mtr_t mtr;
+ long long max_xid_seqno = -1;
+ bool found = false;
+
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS;
+ rseg_id++, mtr.commit()) {
+ mtr.start();
+ const buf_block_t* sys = trx_sysf_get(&mtr, false);
+ const uint32_t page_no = trx_sysf_rseg_get_page_no(
+ sys, rseg_id);
+
+ if (page_no == FIL_NULL) {
+ continue;
+ }
+
+ const buf_block_t* rseg_header = trx_rsegf_get_new(
+ trx_sysf_rseg_get_space(sys, rseg_id), page_no, &mtr);
+
+ if (mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
+ + rseg_header->frame)) {
+ continue;
+ }
+
+ XID tmp_xid;
+ long long tmp_seqno = 0;
+ if (trx_rseg_read_wsrep_checkpoint(rseg_header, tmp_xid)
+ && (tmp_seqno = wsrep_xid_seqno(&tmp_xid))
+ > max_xid_seqno) {
+ found = true;
+ max_xid_seqno = tmp_seqno;
+ xid = tmp_xid;
+ memcpy(wsrep_uuid, wsrep_xid_uuid(&tmp_xid),
+ sizeof wsrep_uuid);
+ }
+ }
+
+ return found;
+}
+#endif /* WITH_WSREP */
+
+/** Upgrade a rollback segment header page to MariaDB 10.3 format.
+@param[in,out] rseg_header rollback segment header page
+@param[in,out] mtr mini-transaction */
+void trx_rseg_format_upgrade(buf_block_t *rseg_header, mtr_t *mtr)
+{
+ mtr->memset(rseg_header, TRX_RSEG + TRX_RSEG_FORMAT, 4, 0);
+	/* Also clear any possible garbage at the end of the page. Old
+	InnoDB versions did not initialize unused parts of pages. */
+ mtr->memset(rseg_header, TRX_RSEG + TRX_RSEG_MAX_TRX_ID + 8,
+ srv_page_size
+ - (FIL_PAGE_DATA_END + TRX_RSEG + TRX_RSEG_MAX_TRX_ID + 8),
+ 0);
+}
+
+/** Create a rollback segment header.
+@param[in,out] space system, undo, or temporary tablespace
+@param[in] rseg_id rollback segment identifier
+@param[in,out] sys_header the TRX_SYS page (NULL for temporary rseg)
+@param[in,out] mtr mini-transaction
+@return the created rollback segment
+@retval NULL on failure */
+buf_block_t*
+trx_rseg_header_create(
+ fil_space_t* space,
+ ulint rseg_id,
+ buf_block_t* sys_header,
+ mtr_t* mtr)
+{
+ buf_block_t* block;
+
+ ut_ad(mtr->memo_contains(*space));
+ ut_ad(!sys_header == (space == fil_system.temp_space));
+
+ /* Allocate a new file segment for the rollback segment */
+ block = fseg_create(space, TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr);
+
+ if (block == NULL) {
+ /* No space left */
+ return block;
+ }
+
+ buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW);
+
+ ut_ad(0 == mach_read_from_4(TRX_RSEG_FORMAT + TRX_RSEG
+ + block->frame));
+ ut_ad(0 == mach_read_from_4(TRX_RSEG_HISTORY_SIZE + TRX_RSEG
+ + block->frame));
+
+ /* Initialize the history list */
+ flst_init(block, TRX_RSEG_HISTORY + TRX_RSEG, mtr);
+
+ /* Reset the undo log slots */
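+	/* Filling with 0xff stores FIL_NULL in each slot, marking it
+	unused. */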
+ mtr->memset(block, TRX_RSEG_UNDO_SLOTS + TRX_RSEG,
+ TRX_RSEG_N_SLOTS * 4, 0xff);
+
+ if (sys_header) {
+ /* Add the rollback segment info to the free slot in
+ the trx system header */
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(
+ *sys_header,
+ TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE
+ + rseg_id * TRX_SYS_RSEG_SLOT_SIZE
+ + sys_header->frame, space->id);
+ mtr->write<4,mtr_t::MAYBE_NOP>(
+ *sys_header,
+ TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO
+ + rseg_id * TRX_SYS_RSEG_SLOT_SIZE
+ + sys_header->frame, block->page.id().page_no());
+ }
+
+ return block;
+}
+
+/** Free a rollback segment in memory. */
+void
+trx_rseg_mem_free(trx_rseg_t* rseg)
+{
+ trx_undo_t* undo;
+ trx_undo_t* next_undo;
+
+ mutex_free(&rseg->mutex);
+
+ /* There can't be any active transactions. */
+ ut_a(UT_LIST_GET_LEN(rseg->undo_list) == 0);
+
+ for (undo = UT_LIST_GET_FIRST(rseg->undo_cached);
+ undo != NULL;
+ undo = next_undo) {
+
+ next_undo = UT_LIST_GET_NEXT(undo_list, undo);
+
+ UT_LIST_REMOVE(rseg->undo_cached, undo);
+
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+
+ ut_free(undo);
+ }
+
+ ut_free(rseg);
+}
+
+/** Create a rollback segment object.
+@param[in] id rollback segment id
+@param[in] space space where the segment is placed
+@param[in] page_no page number of the segment header */
+static
+trx_rseg_t*
+trx_rseg_mem_create(ulint id, fil_space_t* space, uint32_t page_no)
+{
+ trx_rseg_t* rseg = static_cast<trx_rseg_t*>(
+ ut_zalloc_nokey(sizeof *rseg));
+
+ rseg->id = id;
+ rseg->space = space;
+ rseg->page_no = page_no;
+ rseg->last_page_no = FIL_NULL;
+ rseg->curr_size = 1;
+
+ mutex_create(rseg->is_persistent()
+ ? LATCH_ID_REDO_RSEG : LATCH_ID_NOREDO_RSEG,
+ &rseg->mutex);
+
+ UT_LIST_INIT(rseg->undo_list, &trx_undo_t::undo_list);
+ UT_LIST_INIT(rseg->undo_cached, &trx_undo_t::undo_list);
+
+ return(rseg);
+}
+
+/** Read the undo log lists.
+@param[in,out] rseg rollback segment
+@param[in,out] max_trx_id maximum observed transaction identifier
+@param[in] rseg_header rollback segment header
+@return error code */
+static dberr_t trx_undo_lists_init(trx_rseg_t *rseg, trx_id_t &max_trx_id,
+ const buf_block_t *rseg_header)
+{
+ ut_ad(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN);
+
+ for (ulint i= 0; i < TRX_RSEG_N_SLOTS; i++)
+ {
+ uint32_t page_no= trx_rsegf_get_nth_undo(rseg_header, i);
+ if (page_no != FIL_NULL)
+ {
+ const trx_undo_t *undo= trx_undo_mem_create_at_db_start(rseg, i, page_no,
+ max_trx_id);
+ if (!undo)
+ return DB_CORRUPTION;
+ rseg->curr_size+= undo->size;
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED);
+ }
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Restore the state of a persistent rollback segment.
+@param[in,out] rseg persistent rollback segment
+@param[in,out] max_trx_id maximum observed transaction identifier
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, trx_id_t &max_trx_id,
+ mtr_t *mtr)
+{
+ buf_block_t* rseg_hdr = trx_rsegf_get_new(
+ rseg->space->id, rseg->page_no, mtr);
+
+ if (!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + rseg_hdr->frame)) {
+ trx_id_t id = mach_read_from_8(TRX_RSEG + TRX_RSEG_MAX_TRX_ID
+ + rseg_hdr->frame);
+
+ if (id > max_trx_id) {
+ max_trx_id = id;
+ }
+
+ const byte* binlog_name = TRX_RSEG + TRX_RSEG_BINLOG_NAME
+ + rseg_hdr->frame;
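+		/* A nonzero first byte of the name indicates that binlog
+		metadata was written to this rollback segment header. */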
+ if (*binlog_name) {
+ lsn_t lsn = mach_read_from_8(my_assume_aligned<8>(
+ FIL_PAGE_LSN
+ + rseg_hdr
+ ->frame));
+ compile_time_assert(TRX_RSEG_BINLOG_NAME_LEN == sizeof
+ trx_sys.recovered_binlog_filename);
+ if (lsn > trx_sys.recovered_binlog_lsn) {
+ trx_sys.recovered_binlog_lsn = lsn;
+ trx_sys.recovered_binlog_offset
+ = mach_read_from_8(
+ TRX_RSEG
+ + TRX_RSEG_BINLOG_OFFSET
+ + rseg_hdr->frame);
+ memcpy(trx_sys.recovered_binlog_filename,
+ binlog_name,
+ TRX_RSEG_BINLOG_NAME_LEN);
+ }
+
+#ifdef WITH_WSREP
+ trx_rseg_read_wsrep_checkpoint(
+ rseg_hdr, trx_sys.recovered_wsrep_xid);
+#endif
+ }
+ }
+
+ if (srv_operation == SRV_OPERATION_RESTORE) {
+ /* mariabackup --prepare only deals with
+ the redo log and the data files, not with
+ transactions or the data dictionary. */
+ return DB_SUCCESS;
+ }
+
+ /* Initialize the undo log lists according to the rseg header */
+
+ rseg->curr_size = mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+ + rseg_hdr->frame)
+ + 1;
+ if (dberr_t err = trx_undo_lists_init(rseg, max_trx_id, rseg_hdr)) {
+ return err;
+ }
+
+ if (auto len = flst_get_len(TRX_RSEG + TRX_RSEG_HISTORY
+ + rseg_hdr->frame)) {
+ trx_sys.rseg_history_len += len;
+
+ fil_addr_t node_addr = flst_get_last(TRX_RSEG
+ + TRX_RSEG_HISTORY
+ + rseg_hdr->frame);
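+		/* The list node resides at offset TRX_UNDO_HISTORY_NODE
+		within the undo log header; convert the node address so
+		that it points to the start of the undo log header. */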
+ node_addr.boffset = static_cast<uint16_t>(
+ node_addr.boffset - TRX_UNDO_HISTORY_NODE);
+
+ rseg->last_page_no = node_addr.page;
+
+ const buf_block_t* block = trx_undo_page_get(
+ page_id_t(rseg->space->id, node_addr.page), mtr);
+
+ trx_id_t id = mach_read_from_8(block->frame + node_addr.boffset
+ + TRX_UNDO_TRX_ID);
+ if (id > max_trx_id) {
+ max_trx_id = id;
+ }
+ id = mach_read_from_8(block->frame + node_addr.boffset
+ + TRX_UNDO_TRX_NO);
+ if (id > max_trx_id) {
+ max_trx_id = id;
+ }
+
+ rseg->set_last_commit(node_addr.boffset, id);
+ unsigned purge = mach_read_from_2(block->frame
+ + node_addr.boffset
+ + TRX_UNDO_NEEDS_PURGE);
+ ut_ad(purge <= 1);
+ rseg->needs_purge = purge != 0;
+
+ if (rseg->last_page_no != FIL_NULL) {
+
+			/* There is no need to protect this operation with
+			the purge mutex because we are still bootstrapping. */
+ purge_sys.purge_queue.push(*rseg);
+ }
+ }
+
+ return DB_SUCCESS;
+}
+
+/** Read binlog metadata from the TRX_SYS page, in case we are upgrading
+from MySQL or a MariaDB version older than 10.3.5. */
+static void trx_rseg_init_binlog_info(const page_t* page)
+{
+ if (mach_read_from_4(TRX_SYS + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD
+ + page)
+ == TRX_SYS_MYSQL_LOG_MAGIC_N) {
+ memcpy(trx_sys.recovered_binlog_filename,
+ TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_NAME
+ + TRX_SYS + page, TRX_SYS_MYSQL_LOG_NAME_LEN);
+ trx_sys.recovered_binlog_offset = mach_read_from_8(
+ TRX_SYS_MYSQL_LOG_INFO + TRX_SYS_MYSQL_LOG_OFFSET
+ + TRX_SYS + page);
+ }
+
+#ifdef WITH_WSREP
+ trx_rseg_init_wsrep_xid(page, trx_sys.recovered_wsrep_xid);
+#endif
+}
+
+/** Initialize or recover the rollback segments at startup. */
+dberr_t trx_rseg_array_init()
+{
+ trx_id_t max_trx_id = 0;
+
+ *trx_sys.recovered_binlog_filename = '\0';
+ trx_sys.recovered_binlog_offset = 0;
+#ifdef WITH_WSREP
+ trx_sys.recovered_wsrep_xid.null();
+ XID wsrep_sys_xid;
+ wsrep_sys_xid.null();
+ bool wsrep_xid_in_rseg_found = false;
+#endif
+ mtr_t mtr;
+ dberr_t err = DB_SUCCESS;
+
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
+ mtr.start();
+ if (const buf_block_t* sys = trx_sysf_get(&mtr, false)) {
+ if (rseg_id == 0) {
+ /* In case this is an upgrade from
+ before MariaDB 10.3.5, fetch the base
+ information from the TRX_SYS page. */
+ max_trx_id = mach_read_from_8(
+ TRX_SYS + TRX_SYS_TRX_ID_STORE
+ + sys->frame);
+ trx_rseg_init_binlog_info(sys->frame);
+#ifdef WITH_WSREP
+ wsrep_sys_xid.set(&trx_sys.recovered_wsrep_xid);
+#endif
+ }
+
+ const uint32_t page_no = trx_sysf_rseg_get_page_no(
+ sys, rseg_id);
+ if (page_no != FIL_NULL) {
+ trx_rseg_t* rseg = trx_rseg_mem_create(
+ rseg_id,
+ fil_space_get(trx_sysf_rseg_get_space(
+ sys, rseg_id)),
+ page_no);
+ ut_ad(rseg->is_persistent());
+ ut_ad(rseg->id == rseg_id);
+ ut_ad(!trx_sys.rseg_array[rseg_id]);
+ trx_sys.rseg_array[rseg_id] = rseg;
+ if ((err = trx_rseg_mem_restore(
+ rseg, max_trx_id, &mtr))
+ != DB_SUCCESS) {
+ mtr.commit();
+ break;
+ }
+#ifdef WITH_WSREP
+ if (!wsrep_sys_xid.is_null() &&
+ !wsrep_sys_xid.eq(&trx_sys.recovered_wsrep_xid)) {
+ wsrep_xid_in_rseg_found = true;
+ ut_ad(memcmp(wsrep_xid_uuid(&wsrep_sys_xid),
+ wsrep_xid_uuid(&trx_sys.recovered_wsrep_xid),
+ sizeof wsrep_uuid)
+ || wsrep_xid_seqno(
+ &wsrep_sys_xid)
+ <= wsrep_xid_seqno(
+ &trx_sys.recovered_wsrep_xid));
+ }
+#endif
+ }
+ }
+
+ mtr.commit();
+ }
+
+ if (err != DB_SUCCESS) {
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
+ if (trx_rseg_t*& rseg = trx_sys.rseg_array[rseg_id]) {
+ while (trx_undo_t* u= UT_LIST_GET_FIRST(
+ rseg->undo_list)) {
+ UT_LIST_REMOVE(rseg->undo_list, u);
+ ut_free(u);
+ }
+ trx_rseg_mem_free(rseg);
+ rseg = NULL;
+ }
+ }
+ return err;
+ }
+
+#ifdef WITH_WSREP
+ if (!wsrep_sys_xid.is_null()) {
+ /* Upgrade from a version prior to 10.3.5,
+ where WSREP XID was stored in TRX_SYS page.
+ If no rollback segment has a WSREP XID set,
+ we must copy the XID found in TRX_SYS page
+ to rollback segments. */
+ mtr.start();
+
+ if (!wsrep_xid_in_rseg_found) {
+ trx_rseg_update_wsrep_checkpoint(&wsrep_sys_xid, &mtr);
+ }
+
+ /* Finally, clear WSREP XID in TRX_SYS page. */
+ mtr.memset(trx_sysf_get(&mtr),
+ TRX_SYS + TRX_SYS_WSREP_XID_INFO,
+ TRX_SYS_WSREP_XID_LEN, 0);
+ mtr.commit();
+ }
+#endif
+
+ trx_sys.init_max_trx_id(max_trx_id + 1);
+ return DB_SUCCESS;
+}
+
+/** Create a persistent rollback segment.
+@param[in] space_id system or undo tablespace id
+@return pointer to new rollback segment
+@retval NULL on failure */
+trx_rseg_t*
+trx_rseg_create(ulint space_id)
+{
+ trx_rseg_t* rseg = NULL;
+ mtr_t mtr;
+
+ mtr.start();
+
+ fil_space_t* space = mtr_x_lock_space(space_id, &mtr);
+ ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
+
+ if (buf_block_t* sys_header = trx_sysf_get(&mtr)) {
+ ulint rseg_id = trx_sys_rseg_find_free(sys_header);
+ if (buf_block_t* rblock = rseg_id == ULINT_UNDEFINED
+ ? NULL
+ : trx_rseg_header_create(space, rseg_id, sys_header,
+ &mtr)) {
+ ut_ad(trx_sysf_rseg_get_space(sys_header, rseg_id)
+ == space_id);
+ rseg = trx_rseg_mem_create(rseg_id, space,
+ rblock->page.id().
+ page_no());
+ ut_ad(rseg->id == rseg_id);
+ ut_ad(rseg->is_persistent());
+ ut_ad(!trx_sys.rseg_array[rseg->id]);
+ trx_sys.rseg_array[rseg->id] = rseg;
+ }
+ }
+
+ mtr.commit();
+
+ return(rseg);
+}
+
+/** Create the temporary rollback segments. */
+void
+trx_temp_rseg_create()
+{
+ mtr_t mtr;
+
+ for (ulong i = 0; i < TRX_SYS_N_RSEGS; i++) {
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
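+		/* Changes to the temporary tablespace are not redo-logged. */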
+ mtr_x_lock_space(fil_system.temp_space, &mtr);
+
+ buf_block_t* rblock = trx_rseg_header_create(
+ fil_system.temp_space, i, NULL, &mtr);
+ trx_rseg_t* rseg = trx_rseg_mem_create(
+ i, fil_system.temp_space, rblock->page.id().page_no());
+ ut_ad(!rseg->is_persistent());
+ ut_ad(!trx_sys.temp_rsegs[i]);
+ trx_sys.temp_rsegs[i] = rseg;
+ mtr.commit();
+ }
+}
+
+/** Update the offset information about the end of the binlog entry
+which corresponds to the transaction just being committed.
+In a replication slave, this updates the master binlog position
+up to which replication has proceeded.
+@param[in,out] rseg_header rollback segment header
+@param[in] trx committing transaction
+@param[in,out] mtr mini-transaction */
+void trx_rseg_update_binlog_offset(buf_block_t *rseg_header, const trx_t *trx,
+ mtr_t *mtr)
+{
+ DBUG_LOG("trx", "trx_mysql_binlog_offset: " << trx->mysql_log_offset);
+
+ const size_t len = strlen(trx->mysql_log_file_name) + 1;
+
+ ut_ad(len > 1);
+
+ if (UNIV_UNLIKELY(len > TRX_RSEG_BINLOG_NAME_LEN)) {
+ return;
+ }
+
+ mtr->write<8,mtr_t::MAYBE_NOP>(*rseg_header,
+ TRX_RSEG + TRX_RSEG_BINLOG_OFFSET
+ + rseg_header->frame,
+ trx->mysql_log_offset);
+
+ void* name = TRX_RSEG + TRX_RSEG_BINLOG_NAME + rseg_header->frame;
+
+ if (memcmp(trx->mysql_log_file_name, name, len)) {
+ mtr->memcpy(*rseg_header, name, trx->mysql_log_file_name, len);
+ }
+}
diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc
new file mode 100644
index 00000000..3064645f
--- /dev/null
+++ b/storage/innobase/trx/trx0sys.cc
@@ -0,0 +1,339 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0sys.cc
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0sys.h"
+#include "mysqld.h"
+#include "sql_error.h"
+
+#include "fsp0fsp.h"
+#include "mtr0log.h"
+#include "mtr0log.h"
+#include "trx0trx.h"
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "os0file.h"
+
+/** The transaction system */
+trx_sys_t trx_sys;
+
+/** Check whether transaction id is valid.
+@param[in] id transaction id to check
+@param[in] name table name */
+void
+ReadViewBase::check_trx_id_sanity(
+ trx_id_t id,
+ const table_name_t& name)
+{
+ if (id >= trx_sys.get_max_trx_id()) {
+
+ ib::warn() << "A transaction id"
+ << " in a record of table "
+ << name
+ << " is newer than the"
+ << " system-wide maximum.";
+ ut_ad(0);
+ THD *thd = current_thd;
+ if (thd != NULL) {
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name),
+ name.m_name);
+
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_SIGNAL_WARN,
+ "InnoDB: Transaction id"
+ " in a record of table"
+ " %s is newer than system-wide"
+ " maximum.", table_name);
+ }
+ }
+}
+
+#ifdef UNIV_DEBUG
+/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
+uint trx_rseg_n_slots_debug = 0;
+#endif
+
+/** Display the MySQL binlog offset info if it is present in the trx
+system header. */
+void
+trx_sys_print_mysql_binlog_offset()
+{
+ if (!*trx_sys.recovered_binlog_filename) {
+ return;
+ }
+
+ ib::info() << "Last binlog file '"
+ << trx_sys.recovered_binlog_filename
+ << "', position "
+ << trx_sys.recovered_binlog_offset;
+}
+
+/** Find an available rollback segment.
+@param[in]	sys_header	the TRX_SYS page
+@return an unallocated rollback segment slot in the TRX_SYS header
+@retval ULINT_UNDEFINED if not found */
+ulint
+trx_sys_rseg_find_free(const buf_block_t* sys_header)
+{
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
+ if (trx_sysf_rseg_get_page_no(sys_header, rseg_id)
+ == FIL_NULL) {
+ return rseg_id;
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/** Count the number of initialized persistent rollback segment slots. */
+static
+void
+trx_sysf_get_n_rseg_slots()
+{
+ mtr_t mtr;
+ mtr.start();
+
+ srv_available_undo_logs = 0;
+ if (const buf_block_t* sys_header = trx_sysf_get(&mtr, false)) {
+ for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
+ srv_available_undo_logs
+ += trx_sysf_rseg_get_page_no(sys_header,
+ rseg_id)
+ != FIL_NULL;
+ }
+ }
+
+ mtr.commit();
+}
+
+/*****************************************************************//**
+Creates the file page for the transaction system. This function is called only
+at the database creation, before trx_sys_init. */
+static
+void
+trx_sysf_create(
+/*============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint slot_no;
+ buf_block_t* block;
+
+ ut_ad(mtr);
+
+ /* Note that below we first reserve the file space x-latch, and
+ then enter the kernel: we must do it in this order to conform
+ to the latching order rules. */
+
+ mtr_x_lock_space(fil_system.sys_space, mtr);
+ compile_time_assert(TRX_SYS_SPACE == 0);
+
+ /* Create the trx sys file block in a new allocated file segment */
+ block = fseg_create(fil_system.sys_space,
+ TRX_SYS + TRX_SYS_FSEG_HEADER,
+ mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
+
+ ut_a(block->page.id() == page_id_t(0, TRX_SYS_PAGE_NO));
+
+ mtr->write<2>(*block, FIL_PAGE_TYPE + block->frame,
+ FIL_PAGE_TYPE_TRX_SYS);
+
+ ut_ad(!mach_read_from_4(block->frame
+ + TRX_SYS_DOUBLEWRITE
+ + TRX_SYS_DOUBLEWRITE_MAGIC));
+
+	/* Reset the rollback segment slots. Old versions of InnoDB
+	(before MySQL 5.5) defined TRX_SYS_N_RSEGS as 256 and expected
+	the whole array to be initialized. */
+ compile_time_assert(256 >= TRX_SYS_N_RSEGS);
+ compile_time_assert(TRX_SYS + TRX_SYS_RSEGS
+ + 256 * TRX_SYS_RSEG_SLOT_SIZE
+ <= UNIV_PAGE_SIZE_MIN - FIL_PAGE_DATA_END);
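+	/* 0xff bytes store FIL_NULL in every rollback segment slot. */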
+ mtr->memset(block, TRX_SYS + TRX_SYS_RSEGS,
+ 256 * TRX_SYS_RSEG_SLOT_SIZE, 0xff);
+ /* Initialize all of the page. This part used to be uninitialized. */
+ mtr->memset(block, TRX_SYS + TRX_SYS_RSEGS
+ + 256 * TRX_SYS_RSEG_SLOT_SIZE,
+ srv_page_size
+ - (FIL_PAGE_DATA_END + TRX_SYS + TRX_SYS_RSEGS
+ + 256 * TRX_SYS_RSEG_SLOT_SIZE),
+ 0);
+
+ /* Create the first rollback segment in the SYSTEM tablespace */
+ slot_no = trx_sys_rseg_find_free(block);
+ buf_block_t* rblock = trx_rseg_header_create(fil_system.sys_space,
+ slot_no, block, mtr);
+
+ ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
+ ut_a(rblock->page.id() == page_id_t(0, FSP_FIRST_RSEG_PAGE_NO));
+}
+
+/** Create the instance */
+void
+trx_sys_t::create()
+{
+ ut_ad(this == &trx_sys);
+ ut_ad(!is_initialised());
+ m_initialised = true;
+ trx_list.create();
+ rseg_history_len= 0;
+
+ rw_trx_hash.init();
+}
+
+/*****************************************************************//**
+Creates and initializes the transaction system at the database creation. */
+void
+trx_sys_create_sys_pages(void)
+/*==========================*/
+{
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ trx_sysf_create(&mtr);
+
+ mtr_commit(&mtr);
+}
+
+/** Create the rollback segments.
+@return whether the creation succeeded */
+bool
+trx_sys_create_rsegs()
+{
+ /* srv_available_undo_logs reflects the number of persistent
+ rollback segments that have been initialized in the
+ transaction system header page. */
+ ut_ad(srv_undo_tablespaces <= TRX_SYS_MAX_UNDO_SPACES);
+
+ if (high_level_read_only) {
+ srv_available_undo_logs = 0;
+ return(true);
+ }
+
+	/* This is executed in single-threaded mode, so it is not
+	necessary to use the same mtr in trx_rseg_create(). n_used cannot
+	change while the function is executing. */
+ trx_sysf_get_n_rseg_slots();
+
+ ut_ad(srv_available_undo_logs <= TRX_SYS_N_RSEGS);
+
+ /* The first persistent rollback segment is always initialized
+ in the system tablespace. */
+ ut_a(srv_available_undo_logs > 0);
+
+ for (ulint i = 0; srv_available_undo_logs < TRX_SYS_N_RSEGS;
+ i++, srv_available_undo_logs++) {
+ /* Tablespace 0 is the system tablespace.
+ Dedicated undo log tablespaces start from 1. */
+ ulint space = srv_undo_tablespaces > 0
+ ? (i % srv_undo_tablespaces)
+ + srv_undo_space_id_start
+ : TRX_SYS_SPACE;
+
+ if (!trx_rseg_create(space)) {
+ ib::error() << "Unable to allocate the"
+ " requested innodb_undo_logs";
+ return(false);
+ }
+
+		/* Increase the number of active undo tablespaces in case
+		a new rollback segment was assigned to a new undo
+		tablespace. */
+ if (space > srv_undo_tablespaces_active) {
+ srv_undo_tablespaces_active++;
+
+ ut_ad(srv_undo_tablespaces_active == space);
+ }
+ }
+
+ ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS);
+
+ ib::info info;
+ info << srv_available_undo_logs;
+ if (srv_undo_tablespaces_active) {
+ info << " rollback segments in " << srv_undo_tablespaces_active
+ << " undo tablespaces are active.";
+ } else {
+ info << " rollback segments are active.";
+ }
+
+ return(true);
+}
+
+/** Close the transaction system on shutdown */
+void
+trx_sys_t::close()
+{
+ ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
+ if (!is_initialised()) {
+ return;
+ }
+
+ if (size_t size = view_count()) {
+ ib::error() << "All read views were not closed before"
+ " shutdown: " << size << " read views open";
+ }
+
+ rw_trx_hash.destroy();
+
+ /* There can't be any active transactions. */
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ if (trx_rseg_t* rseg = rseg_array[i]) {
+ trx_rseg_mem_free(rseg);
+ }
+
+ if (trx_rseg_t* rseg = temp_rsegs[i]) {
+ trx_rseg_mem_free(rseg);
+ }
+ }
+
+ ut_a(trx_list.empty());
+ trx_list.close();
+ m_initialised = false;
+}
+
+/** @return total number of active (non-prepared) transactions */
+ulint trx_sys_t::any_active_transactions()
+{
+ uint32_t total_trx= 0;
+
+ trx_sys.trx_list.for_each([&total_trx](const trx_t &trx) {
+ if (trx.state == TRX_STATE_COMMITTED_IN_MEMORY ||
+ (trx.state == TRX_STATE_ACTIVE && trx.id))
+ total_trx++;
+ });
+
+ return total_trx;
+}
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
new file mode 100644
index 00000000..cf8fa17c
--- /dev/null
+++ b/storage/innobase/trx/trx0trx.cc
@@ -0,0 +1,2300 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0trx.cc
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0trx.h"
+
+#ifdef WITH_WSREP
+#include <mysql/service_wsrep.h>
+#endif
+
+#include <mysql/service_thd_error_context.h>
+
+#include "btr0sea.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "que0que.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "trx0roll.h"
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "trx0xa.h"
+#include "ut0pool.h"
+#include "ut0vec.h"
+
+#include <set>
+#include <new>
+
+/** The bit pattern corresponding to TRX_ID_MAX */
+const byte trx_id_max_bytes[8] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+/** The bit pattern corresponding to max timestamp */
+const byte timestamp_max_bytes[7] = {
+ 0x7f, 0xff, 0xff, 0xff, 0x0f, 0x42, 0x3f
+};
+
+
+static const ulint MAX_DETAILED_ERROR_LEN = 256;
+
+/** Set of table_id */
+typedef std::set<
+ table_id_t,
+ std::less<table_id_t>,
+ ut_allocator<table_id_t> > table_id_set;
+
+/*************************************************************//**
+Set detailed error message for the transaction. */
+void
+trx_set_detailed_error(
+/*===================*/
+ trx_t* trx, /*!< in: transaction struct */
+ const char* msg) /*!< in: detailed error message */
+{
+ strncpy(trx->detailed_error, msg, MAX_DETAILED_ERROR_LEN - 1);
+ trx->detailed_error[MAX_DETAILED_ERROR_LEN - 1] = '\0';
+}
+
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewound before reading from it. */
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+ trx_t* trx, /*!< in: transaction struct */
+ FILE* file) /*!< in: file to read message from */
+{
+ os_file_read_string(file, trx->detailed_error, MAX_DETAILED_ERROR_LEN);
+}
+
+/********************************************************************//**
+Initialize transaction object.
+@param trx trx to initialize */
+static
+void
+trx_init(
+/*=====*/
+ trx_t* trx)
+{
+ trx->state = TRX_STATE_NOT_STARTED;
+
+ trx->is_recovered = false;
+
+ trx->op_info = "";
+
+ trx->active_commit_ordered = false;
+
+ trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+ trx->check_foreigns = true;
+
+ trx->check_unique_secondary = true;
+
+ trx->lock.n_rec_locks = 0;
+
+ trx->dict_operation = TRX_DICT_OP_NONE;
+
+ trx->table_id = 0;
+
+ trx->error_state = DB_SUCCESS;
+
+ trx->error_key_num = ULINT_UNDEFINED;
+
+ trx->undo_no = 0;
+
+ trx->rsegs.m_redo.rseg = NULL;
+
+ trx->rsegs.m_noredo.rseg = NULL;
+
+ trx->read_only = false;
+
+ trx->auto_commit = false;
+
+ trx->will_lock = false;
+
+ trx->ddl = false;
+
+ trx->internal = false;
+
+ ut_d(trx->start_file = 0);
+
+ ut_d(trx->start_line = 0);
+
+ trx->magic_n = TRX_MAGIC_N;
+
+ trx->lock.que_state = TRX_QUE_RUNNING;
+
+ trx->last_sql_stat_start.least_undo_no = 0;
+
+ ut_ad(!trx->read_view.is_open());
+
+ trx->lock.rec_cached = 0;
+
+ trx->lock.table_cached = 0;
+#ifdef WITH_WSREP
+ ut_ad(!trx->wsrep);
+ ut_ad(!trx->wsrep_UK_scan);
+#endif /* WITH_WSREP */
+}
+
+/** For managing the life-cycle of the trx_t instance that we get
+from the pool. */
+struct TrxFactory {
+
+ /** Initializes a transaction object. It must be explicitly started
+ with trx_start_if_not_started() before using it. The default isolation
+ level is TRX_ISO_REPEATABLE_READ.
+ @param trx Transaction instance to initialise */
+ static void init(trx_t* trx)
+ {
+ /* Explicitly call the constructor of the already
+ allocated object. trx_t objects are allocated by
+ ut_zalloc_nokey() in Pool::Pool() which would not call
+ the constructors of the trx_t members. */
+ new(&trx->mod_tables) trx_mod_tables_t();
+
+ new(&trx->lock.table_locks) lock_list();
+
+ new(&trx->read_view) ReadView();
+
+ trx->rw_trx_hash_pins = 0;
+ trx_init(trx);
+
+ trx->dict_operation_lock_mode = 0;
+
+ trx->xid = UT_NEW_NOKEY(xid_t());
+
+ trx->detailed_error = reinterpret_cast<char*>(
+ ut_zalloc_nokey(MAX_DETAILED_ERROR_LEN));
+
+ trx->lock.lock_heap = mem_heap_create_typed(
+ 1024, MEM_HEAP_FOR_LOCK_HEAP);
+
+ lock_trx_lock_list_init(&trx->lock.trx_locks);
+
+ UT_LIST_INIT(trx->lock.evicted_tables,
+ &dict_table_t::table_LRU);
+
+ UT_LIST_INIT(
+ trx->trx_savepoints,
+ &trx_named_savept_t::trx_savepoints);
+
+ mutex_create(LATCH_ID_TRX, &trx->mutex);
+ }
+
+ /** Release resources held by the transaction object.
+ @param trx the transaction for which to release resources */
+ static void destroy(trx_t* trx)
+ {
+#ifdef __SANITIZE_ADDRESS__
+ /* Unpoison the memory for AddressSanitizer */
+ MEM_MAKE_ADDRESSABLE(trx, sizeof *trx);
+#elif !__has_feature(memory_sanitizer)
+ /* In Valgrind, we cannot cancel MEM_NOACCESS() without
+ changing the state of the V bits (which indicate
+ which bits are initialized).
+ We will declare the contents as initialized.
+ We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */
+ MEM_MAKE_DEFINED(trx, sizeof *trx);
+#endif
+
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+ ut_ad(!trx->mysql_thd);
+
+ ut_a(trx->lock.wait_lock == NULL);
+ ut_a(trx->lock.wait_thr == NULL);
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ if (trx->lock.lock_heap != NULL) {
+ mem_heap_free(trx->lock.lock_heap);
+ trx->lock.lock_heap = NULL;
+ }
+
+ ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+ ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);
+
+ UT_DELETE(trx->xid);
+ ut_free(trx->detailed_error);
+
+ mutex_free(&trx->mutex);
+
+ trx->mod_tables.~trx_mod_tables_t();
+
+ ut_ad(!trx->read_view.is_open());
+
+ trx->lock.table_locks.~lock_list();
+
+ trx->read_view.~ReadView();
+ }
+};
+
+/** The lock strategy for TrxPool */
+struct TrxPoolLock {
+ TrxPoolLock() { }
+
+ /** Create the mutex */
+ void create()
+ {
+ mutex_create(LATCH_ID_TRX_POOL, &m_mutex);
+ }
+
+ /** Acquire the mutex */
+ void enter() { mutex_enter(&m_mutex); }
+
+ /** Release the mutex */
+ void exit() { mutex_exit(&m_mutex); }
+
+ /** Free the mutex */
+ void destroy() { mutex_free(&m_mutex); }
+
+ /** Mutex to use */
+ ib_mutex_t m_mutex;
+};
+
+/** The lock strategy for the TrxPoolManager */
+struct TrxPoolManagerLock {
+ TrxPoolManagerLock() { }
+
+ /** Create the mutex */
+ void create()
+ {
+ mutex_create(LATCH_ID_TRX_POOL_MANAGER, &m_mutex);
+ }
+
+ /** Acquire the mutex */
+ void enter() { mutex_enter(&m_mutex); }
+
+ /** Release the mutex */
+ void exit() { mutex_exit(&m_mutex); }
+
+ /** Free the mutex */
+ void destroy() { mutex_free(&m_mutex); }
+
+ /** Mutex to use */
+ ib_mutex_t m_mutex;
+};
+
+/** Use explicit mutexes for the trx_t pool and its manager. */
+typedef Pool<trx_t, TrxFactory, TrxPoolLock> trx_pool_t;
+typedef PoolManager<trx_pool_t, TrxPoolManagerLock > trx_pools_t;
+
+/** The trx_t pool manager */
+static trx_pools_t* trx_pools;
+
+/** Size of one trx_t pool in bytes. */
+static const ulint MAX_TRX_BLOCK_SIZE = 1024 * 1024 * 4;
+
+/** Create the trx_t pool */
+void
+trx_pool_init()
+{
+ trx_pools = UT_NEW_NOKEY(trx_pools_t(MAX_TRX_BLOCK_SIZE));
+
+ ut_a(trx_pools != 0);
+}
+
+/** Destroy the trx_t pool */
+void
+trx_pool_close()
+{
+ UT_DELETE(trx_pools);
+
+ trx_pools = 0;
+}
+
+/** @return an allocated transaction */
+trx_t *trx_create()
+{
+ trx_t* trx = trx_pools->get();
+
+#ifdef __SANITIZE_ADDRESS__
+ /* Unpoison the memory for AddressSanitizer.
+ It may have been poisoned in trx_t::free().*/
+ MEM_MAKE_ADDRESSABLE(trx, sizeof *trx);
+#elif !__has_feature(memory_sanitizer)
+ /* In Valgrind, we cannot cancel MEM_NOACCESS() without
+ changing the state of the V bits (which indicate
+ which bits are initialized).
+ We will declare the contents as initialized.
+ We did invoke MEM_CHECK_DEFINED() in trx_t::free(). */
+ MEM_MAKE_DEFINED(trx, sizeof *trx);
+#endif
+
+ trx->assert_freed();
+
+ mem_heap_t* heap;
+ ib_alloc_t* alloc;
+
+	/* We just got the trx from the pool; it should be non-locking. */
+ ut_ad(!trx->will_lock);
+ ut_ad(!trx->rw_trx_hash_pins);
+
+ DBUG_LOG("trx", "Create: " << trx);
+
+ heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 8);
+
+ alloc = ib_heap_allocator_create(heap);
+
+ trx->autoinc_locks = ib_vector_create(alloc, sizeof(void**), 4);
+
+ ut_ad(trx->mod_tables.empty());
+ ut_ad(trx->lock.n_rec_locks == 0);
+ ut_ad(trx->lock.table_cached == 0);
+ ut_ad(trx->lock.rec_cached == 0);
+ ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);
+
+#ifdef WITH_WSREP
+ ut_ad(!trx->wsrep_UK_scan);
+#endif /* WITH_WSREP */
+
+ trx_sys.register_trx(trx);
+
+ return(trx);
+}
+
+/** Free the memory to trx_pools */
+void trx_t::free()
+{
+ MEM_CHECK_DEFINED(this, sizeof *this);
+
+ ut_ad(!n_mysql_tables_in_use);
+ ut_ad(!mysql_log_file_name);
+ ut_ad(!mysql_n_tables_locked);
+ ut_ad(!internal);
+ ut_ad(!will_lock);
+ ut_ad(error_state == DB_SUCCESS);
+ ut_ad(magic_n == TRX_MAGIC_N);
+ ut_ad(!read_only);
+ ut_ad(!lock.wait_lock);
+
+ dict_operation= TRX_DICT_OP_NONE;
+ trx_sys.deregister_trx(this);
+ assert_freed();
+ trx_sys.rw_trx_hash.put_pins(this);
+
+ mysql_thd= nullptr;
+
+ // FIXME: We need to avoid this heap free/alloc for each commit.
+ if (autoinc_locks)
+ {
+ ut_ad(ib_vector_is_empty(autoinc_locks));
+ /* We allocated a dedicated heap for the vector. */
+ ib_vector_free(autoinc_locks);
+ autoinc_locks= NULL;
+ }
+
+ mod_tables.clear();
+
+ MEM_NOACCESS(&n_ref, sizeof n_ref);
+ /* do not poison mutex */
+ MEM_NOACCESS(&id, sizeof id);
+ MEM_NOACCESS(&state, sizeof state);
+ MEM_NOACCESS(&is_recovered, sizeof is_recovered);
+#ifdef WITH_WSREP
+ MEM_NOACCESS(&wsrep, sizeof wsrep);
+#endif
+ read_view.mem_noaccess();
+ MEM_NOACCESS(&lock, sizeof lock);
+ MEM_NOACCESS(&op_info, sizeof op_info);
+ MEM_NOACCESS(&isolation_level, sizeof isolation_level);
+ MEM_NOACCESS(&check_foreigns, sizeof check_foreigns);
+ MEM_NOACCESS(&is_registered, sizeof is_registered);
+ MEM_NOACCESS(&active_commit_ordered, sizeof active_commit_ordered);
+ MEM_NOACCESS(&check_unique_secondary, sizeof check_unique_secondary);
+ MEM_NOACCESS(&flush_log_later, sizeof flush_log_later);
+ MEM_NOACCESS(&must_flush_log_later, sizeof must_flush_log_later);
+ MEM_NOACCESS(&duplicates, sizeof duplicates);
+ MEM_NOACCESS(&dict_operation, sizeof dict_operation);
+ MEM_NOACCESS(&dict_operation_lock_mode, sizeof dict_operation_lock_mode);
+ MEM_NOACCESS(&start_time, sizeof start_time);
+ MEM_NOACCESS(&start_time_micro, sizeof start_time_micro);
+ MEM_NOACCESS(&commit_lsn, sizeof commit_lsn);
+ MEM_NOACCESS(&table_id, sizeof table_id);
+ MEM_NOACCESS(&mysql_thd, sizeof mysql_thd);
+ MEM_NOACCESS(&mysql_log_file_name, sizeof mysql_log_file_name);
+ MEM_NOACCESS(&mysql_log_offset, sizeof mysql_log_offset);
+ MEM_NOACCESS(&n_mysql_tables_in_use, sizeof n_mysql_tables_in_use);
+ MEM_NOACCESS(&mysql_n_tables_locked, sizeof mysql_n_tables_locked);
+ MEM_NOACCESS(&error_state, sizeof error_state);
+ MEM_NOACCESS(&error_info, sizeof error_info);
+ MEM_NOACCESS(&error_key_num, sizeof error_key_num);
+ MEM_NOACCESS(&graph, sizeof graph);
+ MEM_NOACCESS(&trx_savepoints, sizeof trx_savepoints);
+ MEM_NOACCESS(&undo_no, sizeof undo_no);
+ MEM_NOACCESS(&last_sql_stat_start, sizeof last_sql_stat_start);
+ MEM_NOACCESS(&rsegs, sizeof rsegs);
+ MEM_NOACCESS(&roll_limit, sizeof roll_limit);
+ MEM_NOACCESS(&in_rollback, sizeof in_rollback);
+ MEM_NOACCESS(&pages_undone, sizeof pages_undone);
+ MEM_NOACCESS(&n_autoinc_rows, sizeof n_autoinc_rows);
+ MEM_NOACCESS(&autoinc_locks, sizeof autoinc_locks);
+ MEM_NOACCESS(&read_only, sizeof read_only);
+ MEM_NOACCESS(&auto_commit, sizeof auto_commit);
+ MEM_NOACCESS(&will_lock, sizeof will_lock);
+ MEM_NOACCESS(&fts_trx, sizeof fts_trx);
+ MEM_NOACCESS(&fts_next_doc_id, sizeof fts_next_doc_id);
+ MEM_NOACCESS(&flush_tables, sizeof flush_tables);
+ MEM_NOACCESS(&ddl, sizeof ddl);
+ MEM_NOACCESS(&internal, sizeof internal);
+#ifdef UNIV_DEBUG
+ MEM_NOACCESS(&start_line, sizeof start_line);
+ MEM_NOACCESS(&start_file, sizeof start_file);
+#endif /* UNIV_DEBUG */
+ MEM_NOACCESS(&xid, sizeof xid);
+ MEM_NOACCESS(&mod_tables, sizeof mod_tables);
+ MEM_NOACCESS(&detailed_error, sizeof detailed_error);
+#ifdef WITH_WSREP
+ ut_ad(!wsrep_UK_scan);
+ MEM_NOACCESS(&wsrep_UK_scan, sizeof wsrep_UK_scan);
+#endif /* WITH_WSREP */
+ MEM_NOACCESS(&magic_n, sizeof magic_n);
+ trx_pools->mem_free(this);
+}
+
+/** Transition to committed state, to release implicit locks. */
+inline void trx_t::commit_state()
+{
+ ut_ad(state == TRX_STATE_PREPARED
+ || state == TRX_STATE_PREPARED_RECOVERED
+ || state == TRX_STATE_ACTIVE);
+ /* This makes the transaction committed in memory and makes its
+ changes to data visible to other transactions. NOTE that there is a
+ small discrepancy from the strict formal visibility rules here: a
+ user of the database can see modifications made by another
+ transaction T even before the necessary redo log segment has been
+ flushed to the disk. If the database happens to crash before the
+ flush, the user has seen modifications from T which will never be a
+ committed transaction. However, any transaction T2 which sees the
+ modifications of the committing transaction T, and which also itself
+ makes modifications to the database, will get an lsn larger than the
+ committing transaction T. In the case where the log flush fails, and
+ T never gets committed, also T2 will never get committed. */
+ trx_mutex_enter(this);
+ state= TRX_STATE_COMMITTED_IN_MEMORY;
+ trx_mutex_exit(this);
+ ut_ad(id || !is_referenced());
+}
+
+/** Release any explicit locks of a committing transaction. */
+inline void trx_t::release_locks()
+{
+ DBUG_ASSERT(state == TRX_STATE_COMMITTED_IN_MEMORY);
+ DBUG_ASSERT(!is_referenced());
+
+ if (UT_LIST_GET_LEN(lock.trx_locks))
+ {
+ lock_release(this);
+ lock.n_rec_locks = 0;
+ ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+ ut_ad(ib_vector_is_empty(autoinc_locks));
+ mem_heap_empty(lock.lock_heap);
+ }
+
+ lock.table_locks.clear();
+}
+
+/** At shutdown, frees a transaction object. */
+void
+trx_free_at_shutdown(trx_t *trx)
+{
+ ut_ad(trx->is_recovered);
+ ut_a(trx_state_eq(trx, TRX_STATE_PREPARED)
+ || trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)
+ || (trx_state_eq(trx, TRX_STATE_ACTIVE)
+ && (!srv_was_started
+ || srv_operation == SRV_OPERATION_RESTORE
+ || srv_operation == SRV_OPERATION_RESTORE_EXPORT
+ || srv_read_only_mode
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
+ || (!srv_is_being_started
+ && !srv_undo_sources && srv_fast_shutdown))));
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+
+ trx->commit_state();
+ trx->release_locks();
+ trx_undo_free_at_shutdown(trx);
+
+ ut_a(!trx->read_only);
+
+ DBUG_LOG("trx", "Free prepared: " << trx);
+ trx->state = TRX_STATE_NOT_STARTED;
+ ut_ad(!UT_LIST_GET_LEN(trx->lock.trx_locks));
+ trx->id = 0;
+ trx->free();
+}
+
+
+/**
+ Disconnect a prepared transaction from MySQL
+ @param[in,out] trx transaction
+*/
+void trx_disconnect_prepared(trx_t *trx)
+{
+ ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(trx->mysql_thd);
+ ut_ad(!trx->mysql_log_file_name);
+ trx->read_view.close();
+ trx->is_recovered= true;
+ trx->mysql_thd= NULL;
+ /* todo/fixme: suggest to do it at innodb prepare */
+ trx->will_lock= false;
+ trx_sys.rw_trx_hash.put_pins(trx);
+}
+
+/****************************************************************//**
+Resurrect the table locks for a resurrected transaction. */
+static
+void
+trx_resurrect_table_locks(
+/*======================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const trx_undo_t* undo) /*!< in: undo log */
+{
+ mtr_t mtr;
+ table_id_set tables;
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(undo->rseg == trx->rsegs.m_redo.rseg);
+
+ if (undo->empty()) {
+ return;
+ }
+
+ mtr_start(&mtr);
+
+ /* trx_rseg_mem_create() may have acquired an X-latch on this
+ page, so we cannot acquire an S-latch. */
+ buf_block_t* block = trx_undo_page_get(
+ page_id_t(trx->rsegs.m_redo.rseg->space->id,
+ undo->top_page_no), &mtr);
+ buf_block_t* undo_block = block;
+ trx_undo_rec_t* undo_rec = block->frame + undo->top_offset;
+
+ do {
+ ulint type;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ ulint cmpl_info;
+ bool updated_extern;
+
+ if (undo_block != block) {
+ mtr.memo_release(undo_block, MTR_MEMO_PAGE_X_FIX);
+ undo_block = block;
+ }
+
+ trx_undo_rec_get_pars(
+ undo_rec, &type, &cmpl_info,
+ &updated_extern, &undo_no, &table_id);
+ tables.insert(table_id);
+
+ undo_rec = trx_undo_get_prev_rec(
+ block, page_offset(undo_rec), undo->hdr_page_no,
+ undo->hdr_offset, false, &mtr);
+ } while (undo_rec);
+
+ mtr_commit(&mtr);
+
+ for (table_id_set::const_iterator i = tables.begin();
+ i != tables.end(); i++) {
+ if (dict_table_t* table = dict_table_open_on_id(
+ *i, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE)) {
+ if (!table->is_readable()) {
+ mutex_enter(&dict_sys.mutex);
+ dict_table_close(table, TRUE, FALSE);
+ dict_sys.remove(table);
+ mutex_exit(&dict_sys.mutex);
+ continue;
+ }
+
+ if (trx->state == TRX_STATE_PREPARED) {
+ trx->mod_tables.insert(
+ trx_mod_tables_t::value_type(table,
+ 0));
+ }
+ lock_table_ix_resurrect(table, trx);
+
+ DBUG_LOG("ib_trx",
+ "resurrect " << ib::hex(trx->id)
+ << " IX lock on " << table->name);
+
+ dict_table_close(table, FALSE, FALSE);
+ }
+ }
+}
+
+
+/**
+  Resurrect the transactions that were doing inserts/updates at the time of
+  the crash; they need to be undone.
+*/
+
+static void trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg,
+ time_t start_time, ulonglong start_time_micro,
+ uint64_t *rows_to_undo)
+{
+ trx_state_t state;
+ /*
+    This is single-threaded startup code; we do not need the
+    protection of trx->mutex here.
+ */
+ switch (undo->state)
+ {
+ case TRX_UNDO_ACTIVE:
+ state= TRX_STATE_ACTIVE;
+ break;
+ case TRX_UNDO_PREPARED:
+ /*
+ Prepared transactions are left in the prepared state
+ waiting for a commit or abort decision from MySQL
+ */
+ ib::info() << "Transaction " << undo->trx_id
+ << " was in the XA prepared state.";
+
+ state= TRX_STATE_PREPARED;
+ break;
+ default:
+ return;
+ }
+
+ trx_t *trx= trx_create();
+ trx->state= state;
+ ut_d(trx->start_file= __FILE__);
+ ut_d(trx->start_line= __LINE__);
+
+ trx->rsegs.m_redo.undo= undo;
+ trx->undo_no= undo->top_undo_no + 1;
+ trx->rsegs.m_redo.rseg= rseg;
+ /*
+ Transactions with active data will not have rseg size = 1, nor will they
+ qualify for the purge limit criteria, so it is safe to increment this
+ trx_ref_count without mutex protection.
+ */
+ ++trx->rsegs.m_redo.rseg->trx_ref_count;
+ *trx->xid= undo->xid;
+ trx->id= undo->trx_id;
+ trx->is_recovered= true;
+ trx->start_time= start_time;
+ trx->start_time_micro= start_time_micro;
+
+ if (undo->dict_operation)
+ {
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+ if (!trx->table_id)
+ trx->table_id= undo->table_id;
+ }
+
+ trx_sys.rw_trx_hash.insert(trx);
+ trx_sys.rw_trx_hash.put_pins(trx);
+ trx_resurrect_table_locks(trx, undo);
+ if (trx_state_eq(trx, TRX_STATE_ACTIVE))
+ *rows_to_undo+= trx->undo_no;
+}
+
+
+/** Initialize (resurrect) transactions at startup. */
+dberr_t trx_lists_init_at_db_start()
+{
+ ut_a(srv_is_being_started);
+ ut_ad(!srv_was_started);
+
+ if (srv_operation == SRV_OPERATION_RESTORE) {
+ /* mariabackup --prepare only deals with
+ the redo log and the data files, not with
+ transactions or the data dictionary. */
+ return trx_rseg_array_init();
+ }
+
+ if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) {
+ return DB_SUCCESS;
+ }
+
+ purge_sys.create();
+ if (dberr_t err = trx_rseg_array_init()) {
+ ib::info() << "Retry with innodb_force_recovery=5";
+ return err;
+ }
+
+ /* Look from the rollback segments if there exist undo logs for
+ transactions. */
+ const time_t start_time = time(NULL);
+ const ulonglong start_time_micro= microsecond_interval_timer();
+ uint64_t rows_to_undo = 0;
+
+ for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ trx_undo_t* undo;
+ trx_rseg_t* rseg = trx_sys.rseg_array[i];
+
+ /* Some rollback segment may be unavailable,
+ especially if the server was previously run with a
+ non-default value of innodb_undo_logs. */
+ if (rseg == NULL) {
+ continue;
+ }
+ /* Resurrect other transactions. */
+ for (undo = UT_LIST_GET_FIRST(rseg->undo_list);
+ undo != NULL;
+ undo = UT_LIST_GET_NEXT(undo_list, undo)) {
+ trx_t *trx = trx_sys.find(0, undo->trx_id, false);
+ if (!trx) {
+ trx_resurrect(undo, rseg, start_time,
+ start_time_micro, &rows_to_undo);
+ } else {
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(trx->start_time == start_time);
+ ut_ad(trx->is_recovered);
+ ut_ad(trx->rsegs.m_redo.rseg == rseg);
+ ut_ad(trx->rsegs.m_redo.rseg->trx_ref_count);
+
+ trx->rsegs.m_redo.undo = undo;
+ if (undo->top_undo_no >= trx->undo_no) {
+ if (trx_state_eq(trx,
+ TRX_STATE_ACTIVE)) {
+ rows_to_undo -= trx->undo_no;
+ rows_to_undo +=
+ undo->top_undo_no + 1;
+ }
+
+ trx->undo_no = undo->top_undo_no + 1;
+ }
+ trx_resurrect_table_locks(trx, undo);
+ }
+ }
+ }
+
+ if (const auto size = trx_sys.rw_trx_hash.size()) {
+ ib::info() << size
+ << " transaction(s) which must be rolled back or"
+ " cleaned up in total " << rows_to_undo
+ << " row operations to undo";
+ ib::info() << "Trx id counter is " << trx_sys.get_max_trx_id();
+ }
+
+ purge_sys.clone_oldest_view();
+ return DB_SUCCESS;
+}
+
+/** Assign a persistent rollback segment in a round-robin fashion,
+evenly distributed between 0 and innodb_undo_logs-1
+@return persistent rollback segment
+@retval NULL if innodb_read_only */
+static trx_rseg_t* trx_assign_rseg_low()
+{
+ if (high_level_read_only) {
+ ut_ad(!srv_available_undo_logs);
+ return(NULL);
+ }
+
+ ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS);
+
+ /* The first slot is always assigned to the system tablespace. */
+ ut_ad(trx_sys.rseg_array[0]->space == fil_system.sys_space);
+
+ /* Choose a rollback segment evenly distributed between 0 and
+ innodb_undo_logs-1 in a round-robin fashion, skipping those
+ undo tablespaces that are scheduled for truncation. */
+ static Atomic_counter<unsigned> rseg_slot;
+ unsigned slot = rseg_slot++ % TRX_SYS_N_RSEGS;
+ ut_d(if (trx_rseg_n_slots_debug) slot = 0);
+ trx_rseg_t* rseg;
+
+#ifdef UNIV_DEBUG
+ ulint start_scan_slot = slot;
+ bool look_for_rollover = false;
+#endif /* UNIV_DEBUG */
+
+ bool allocated = false;
+
+ do {
+ for (;;) {
+ rseg = trx_sys.rseg_array[slot];
+
+#ifdef UNIV_DEBUG
+ /* Ensure that we are not revisiting the same
+ slot that we have already inspected. */
+ if (look_for_rollover) {
+ ut_ad(start_scan_slot != slot);
+ }
+ look_for_rollover = true;
+#endif /* UNIV_DEBUG */
+
+ ut_d(if (!trx_rseg_n_slots_debug))
+ slot = (slot + 1) % TRX_SYS_N_RSEGS;
+
+ if (rseg == NULL) {
+ continue;
+ }
+
+ ut_ad(rseg->is_persistent());
+
+ if (rseg->space != fil_system.sys_space) {
+ if (rseg->skip_allocation
+ || !srv_undo_tablespaces) {
+ continue;
+ }
+ } else if (trx_rseg_t* next
+ = trx_sys.rseg_array[slot]) {
+ if (next->space != fil_system.sys_space
+ && srv_undo_tablespaces > 0) {
+ /* If dedicated
+ innodb_undo_tablespaces have
+ been configured, try to use them
+ instead of the system tablespace. */
+ continue;
+ }
+ }
+
+ break;
+ }
+
+ /* By now we have only selected the rseg but not marked it
+ allocated. By marking it allocated we are ensuring that it will
+ never be selected for UNDO truncate purge. */
+ mutex_enter(&rseg->mutex);
+ if (!rseg->skip_allocation) {
+ rseg->trx_ref_count++;
+ allocated = true;
+ }
+ mutex_exit(&rseg->mutex);
+ } while (!allocated);
+
+ ut_ad(rseg->trx_ref_count > 0);
+ ut_ad(rseg->is_persistent());
+ return(rseg);
+}
+
+/** Assign a rollback segment for modifying temporary tables.
+@return the assigned rollback segment */
+trx_rseg_t *trx_t::assign_temp_rseg()
+{
+ ut_ad(!rsegs.m_noredo.rseg);
+ ut_ad(!is_autocommit_non_locking());
+ compile_time_assert(ut_is_2pow(TRX_SYS_N_RSEGS));
+
+ /* Choose a temporary rollback segment between 0 and 127
+ in a round-robin fashion. */
+ static Atomic_counter<unsigned> rseg_slot;
+ trx_rseg_t* rseg = trx_sys.temp_rsegs[
+ rseg_slot++ & (TRX_SYS_N_RSEGS - 1)];
+ ut_ad(!rseg->is_persistent());
+ rsegs.m_noredo.rseg = rseg;
+
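+ /* Even a transaction that only modifies temporary tables needs a
+ nonzero transaction id; assign one now if none has been assigned yet. */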
+ if (id == 0) {
+ trx_sys.register_rw(this);
+ }
+
+ ut_ad(!rseg->is_persistent());
+ return(rseg);
+}
+
+/****************************************************************//**
+Starts a transaction. */
+static
+void
+trx_start_low(
+/*==========*/
+ trx_t* trx, /*!< in: transaction */
+ bool read_write) /*!< in: true if read-write transaction */
+{
+ ut_ad(!trx->in_rollback);
+ ut_ad(!trx->is_recovered);
+ ut_ad(trx->start_line != 0);
+ ut_ad(trx->start_file != 0);
+ ut_ad(trx->roll_limit == 0);
+ ut_ad(trx->error_state == DB_SUCCESS);
+ ut_ad(trx->rsegs.m_redo.rseg == NULL);
+ ut_ad(trx->rsegs.m_noredo.rseg == NULL);
+ ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+ ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+
+ /* Check whether it is an AUTOCOMMIT SELECT */
+ trx->auto_commit = thd_trx_is_auto_commit(trx->mysql_thd);
+
+ trx->read_only = srv_read_only_mode
+ || (!trx->ddl && !trx->internal
+ && thd_trx_is_read_only(trx->mysql_thd));
+
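+ /* An autocommit transaction that is not going to acquire any locks is
+ treated as read-only, so that it can run as a cheap non-locking
+ autocommit transaction. */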
+ if (!trx->auto_commit) {
+ trx->will_lock = true;
+ } else if (!trx->will_lock) {
+ trx->read_only = true;
+ }
+
+#ifdef WITH_WSREP
+ trx->xid->null();
+#endif /* WITH_WSREP */
+
+ ut_a(ib_vector_is_empty(trx->autoinc_locks));
+ ut_a(trx->lock.table_locks.empty());
+
+ /* No other thread can access this trx object through rw_trx_hash,
+ but it can still be found through trx_sys.trx_list. Sometimes it's
+ possible to indirectly protect trx_t::state by freezing
+ trx_sys.trx_list.
+
+ For now we update it without mutex protection, because original code
+ did it this way. It has to be reviewed and fixed properly. */
+ trx->state = TRX_STATE_ACTIVE;
+
+ /* By default all transactions are in the read-only list unless they
+ are non-locking auto-commit read only transactions or background
+ (internal) transactions. Note: Transactions marked explicitly as
+ read only can write to temporary tables; we put those on the RO
+ list too. */
+
+ if (!trx->read_only
+ && (trx->mysql_thd == 0 || read_write || trx->ddl)) {
+
+ /* Temporary rseg is assigned only if the transaction
+ updates a temporary table */
+ trx->rsegs.m_redo.rseg = trx_assign_rseg_low();
+ ut_ad(trx->rsegs.m_redo.rseg != 0
+ || srv_read_only_mode
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
+
+ trx_sys.register_rw(trx);
+ } else {
+ if (!trx->is_autocommit_non_locking()) {
+
+ /* If this is a read-only transaction that is writing
+ to a temporary table then it needs a transaction id
+ to write to the temporary table. */
+
+ if (read_write) {
+ ut_ad(!srv_read_only_mode);
+ trx_sys.register_rw(trx);
+ }
+ } else {
+ ut_ad(!read_write);
+ }
+ }
+
+ trx->start_time = time(NULL);
+ trx->start_time_micro = trx->mysql_thd
+ ? thd_query_start_micro(trx->mysql_thd)
+ : microsecond_interval_timer();
+
+ ut_a(trx->error_state == DB_SUCCESS);
+
+ MONITOR_INC(MONITOR_TRX_ACTIVE);
+}
+
+/** Set the serialisation number for a persistent committed transaction.
+@param[in,out] trx committed transaction with persistent changes */
+static
+void
+trx_serialise(trx_t* trx)
+{
+ trx_rseg_t *rseg = trx->rsegs.m_redo.rseg;
+ ut_ad(rseg);
+ ut_ad(mutex_own(&rseg->mutex));
+
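+ /* If the rollback segment is currently empty, this commit will produce
+ the segment's next purge event. Latch purge_sys.pq_mutex before assigning
+ the commit number, so that the number assignment and the purge queue
+ insertion below happen atomically. */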
+ if (rseg->last_page_no == FIL_NULL) {
+ mutex_enter(&purge_sys.pq_mutex);
+ }
+
+ trx_sys.assign_new_trx_no(trx);
+
+ /* If the rollback segment is not empty then the
+ new trx_t::no can't be less than any trx_t::no
+ already in the rollback segment. User threads only
+ produce events when a rollback segment is empty. */
+ if (rseg->last_page_no == FIL_NULL) {
+ purge_sys.purge_queue.push(TrxUndoRsegs(trx->rw_trx_hash_element->no,
+ *rseg));
+ mutex_exit(&purge_sys.pq_mutex);
+ }
+}
+
+/****************************************************************//**
+Assign the transaction its history serialisation number and write the
+update UNDO log record to the assigned rollback segment. */
+static
+void
+trx_write_serialisation_history(
+/*============================*/
+ trx_t* trx, /*!< in/out: transaction */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ /* Change the undo log segment states from TRX_UNDO_ACTIVE to some
+ other state: these modifications to the file data structure define
+ the transaction as committed in the file based domain, at the
+ serialization point of the log sequence number lsn obtained below. */
+
+ /* We have to hold the rseg mutex because update log headers have
+ to be put to the history list in the (serialisation) order of the
+ UNDO trx number. This is required for the purge in-memory data
+ structures too. */
+
+ if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) {
+ /* Undo log for temporary tables is discarded at transaction
+ commit. There is no purge for temporary tables, and also no
+ MVCC, because they are private to a session. */
+
+ mtr_t temp_mtr;
+ temp_mtr.start();
+ temp_mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ mutex_enter(&trx->rsegs.m_noredo.rseg->mutex);
+ trx_undo_set_state_at_finish(undo, &temp_mtr);
+ mutex_exit(&trx->rsegs.m_noredo.rseg->mutex);
+ temp_mtr.commit();
+ }
+
+ trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
+ if (!rseg) {
+ ut_ad(!trx->rsegs.m_redo.undo);
+ return;
+ }
+
+ trx_undo_t*& undo = trx->rsegs.m_redo.undo;
+
+ if (!undo) {
+ return;
+ }
+
+ ut_ad(!trx->read_only);
+ ut_ad(!undo || undo->rseg == rseg);
+ mutex_enter(&rseg->mutex);
+
+ /* Assign the transaction serialisation number and add any
+ undo log to the purge queue. */
+ trx_serialise(trx);
+ if (undo) {
+ UT_LIST_REMOVE(rseg->undo_list, undo);
+ trx_purge_add_undo_to_history(trx, undo, mtr);
+ }
+
+ mutex_exit(&rseg->mutex);
+
+ MONITOR_INC(MONITOR_TRX_COMMIT_UNDO);
+}
+
+/********************************************************************
+Finalize a transaction containing updates for a FTS table. */
+static
+void
+trx_finalize_for_fts_table(
+/*=======================*/
+ fts_trx_table_t* ftt) /* in: FTS trx table */
+{
+ fts_t* fts = ftt->table->fts;
+ fts_doc_ids_t* doc_ids = ftt->added_doc_ids;
+
+ ut_a(fts->add_wq);
+
+ mem_heap_t* heap = static_cast<mem_heap_t*>(doc_ids->self_heap->arg);
+
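+ /* Hand the accumulated doc ids over to the table's FTS add queue
+ (fts_t::add_wq); ownership of the list is transferred along with it. */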
+ ib_wqueue_add(fts->add_wq, doc_ids, heap);
+
+ /* fts_trx_table_t no longer owns the list. */
+ ftt->added_doc_ids = NULL;
+}
+
+/******************************************************************//**
+Finalize a transaction containing updates to FTS tables. */
+static
+void
+trx_finalize_for_fts(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ bool is_commit) /*!< in: true if the transaction was
+ committed, false if it was rolled back. */
+{
+ if (is_commit) {
+ const ib_rbt_node_t* node;
+ ib_rbt_t* tables;
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_last(trx->fts_trx->savepoints));
+
+ tables = savepoint->tables;
+
+ for (node = rbt_first(tables);
+ node;
+ node = rbt_next(tables, node)) {
+ fts_trx_table_t** ftt;
+
+ ftt = rbt_value(fts_trx_table_t*, node);
+
+ if ((*ftt)->added_doc_ids) {
+ trx_finalize_for_fts_table(*ftt);
+ }
+ }
+ }
+
+ fts_trx_free(trx->fts_trx);
+ trx->fts_trx = NULL;
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk based on the value of
+innodb_flush_log_at_trx_commit. */
+static
+void
+trx_flush_log_if_needed_low(
+/*========================*/
+ lsn_t lsn) /*!< in: lsn up to which logs are to be
+ flushed. */
+{
+ bool flush = srv_file_flush_method != SRV_NOSYNC;
+
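+ /* innodb_flush_log_at_trx_commit=0: do nothing here;
+ 2: write the log up to lsn, but do not flush it to disk;
+ 1 or 3: write the log and also flush it, unless
+ srv_file_flush_method == SRV_NOSYNC, in which case only the write is done. */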
+ switch (srv_flush_log_at_trx_commit) {
+ case 2:
+ /* Write the log but do not flush it to disk */
+ flush = false;
+ /* fall through */
+ case 1:
+ case 3:
+ /* Write the log and optionally flush it to disk */
+ log_write_up_to(lsn, flush);
+ srv_inc_activity_count();
+ return;
+ case 0:
+ /* Do nothing */
+ return;
+ }
+
+ ut_error;
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk based on the value of
+innodb_flush_log_at_trx_commit. */
+static
+void
+trx_flush_log_if_needed(
+/*====================*/
+ lsn_t lsn, /*!< in: lsn up to which logs are to be
+ flushed. */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ trx->op_info = "flushing log";
+ trx_flush_log_if_needed_low(lsn);
+ trx->op_info = "";
+}
+
+/**********************************************************************//**
+For each table that has been modified by the given transaction: update
+its dict_table_t::update_time with the current timestamp. Clear the list
+of the modified tables at the end. */
+static
+void
+trx_update_mod_tables_timestamp(
+/*============================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ /* consider using trx->start_time if calling time() is too
+ expensive here */
+ const time_t now = time(NULL);
+
+ trx_mod_tables_t::const_iterator end = trx->mod_tables.end();
+
+ for (trx_mod_tables_t::const_iterator it = trx->mod_tables.begin();
+ it != end;
+ ++it) {
+
+ /* This could be executed by multiple threads concurrently
+ on the same table object. This is fine because time_t is
+ word size or less. And _purely_ _theoretically_, even if
+ time_t write is not atomic, likely the value of 'now' is
+ the same in all threads and even if it is not, getting a
+ "garbage" in table->update_time is justified because
+ protecting it with a latch here would be too performance
+ intrusive. */
+ dict_table_t* table = it->first;
+ table->update_time = now;
+ }
+
+ trx->mod_tables.clear();
+}
+
+/** Evict a table definition due to the rollback of ALTER TABLE.
+@param[in] table_id table identifier */
+void trx_t::evict_table(table_id_t table_id)
+{
+ ut_ad(in_rollback);
+
+ dict_table_t* table = dict_table_open_on_id(
+ table_id, true, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
+ if (!table) {
+ return;
+ }
+
+ if (!table->release()) {
+ /* This must be a DDL operation that is being rolled
+ back in an active connection. */
+ ut_a(table->get_ref_count() == 1);
+ ut_ad(!is_recovered);
+ ut_ad(mysql_thd);
+ return;
+ }
+
+ /* This table should only be locked by this transaction, if at all. */
+ ut_ad(UT_LIST_GET_LEN(table->locks) <= 1);
+ const bool locked = UT_LIST_GET_LEN(table->locks);
+ ut_ad(!locked || UT_LIST_GET_FIRST(table->locks)->trx == this);
+ dict_sys.remove(table, true, locked);
+ if (locked) {
+ UT_LIST_ADD_FIRST(lock.evicted_tables, table);
+ }
+}
+
+/** Mark a transaction committed in the main memory data structures. */
+inline void trx_t::commit_in_memory(const mtr_t *mtr)
+{
+ must_flush_log_later= false;
+ read_view.close();
+
+ if (is_autocommit_non_locking())
+ {
+ ut_ad(id == 0);
+ ut_ad(read_only);
+ ut_ad(!will_lock);
+ ut_a(!is_recovered);
+ ut_ad(!rsegs.m_redo.rseg);
+ ut_ad(mysql_thd);
+ ut_ad(state == TRX_STATE_ACTIVE);
+
+ /* Note: We are asserting without holding the lock mutex. But
+ that is OK because this transaction is not waiting and cannot
+ be rolled back and no new locks can (or should) be added
+ because it is flagged as a non-locking read-only transaction. */
+ ut_a(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+
+ /* This state change is not protected by any mutex, therefore
+ there is an inherent race here around state transition during
+ printouts. We ignore this race for the sake of efficiency.
+ However, the freezing of trx_sys.trx_list will protect the trx_t
+ instance and it cannot be removed from the trx_list and freed
+ without first unfreezing trx_list. */
+ state= TRX_STATE_NOT_STARTED;
+
+ MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT);
+
+ DBUG_LOG("trx", "Autocommit in memory: " << this);
+ }
+ else
+ {
+#ifdef UNIV_DEBUG
+ if (!UT_LIST_GET_LEN(lock.trx_locks))
+ for (auto l : lock.table_locks)
+ ut_ad(!l);
+#endif /* UNIV_DEBUG */
+ commit_state();
+
+ if (id)
+ {
+ trx_sys.deregister_rw(this);
+
+ /* Wait for any implicit-to-explicit lock conversions to cease,
+ so that there will be no race condition in lock_release(). */
+ while (UNIV_UNLIKELY(is_referenced()))
+ ut_delay(srv_spin_wait_delay);
+ }
+ else
+ ut_ad(read_only || !rsegs.m_redo.rseg);
+
+ if (read_only || !rsegs.m_redo.rseg)
+ {
+ MONITOR_INC(MONITOR_TRX_RO_COMMIT);
+ }
+ else
+ {
+ trx_update_mod_tables_timestamp(this);
+ MONITOR_INC(MONITOR_TRX_RW_COMMIT);
+ is_recovered= false;
+ }
+
+ release_locks();
+ id= 0;
+ DEBUG_SYNC_C("after_trx_committed_in_memory");
+
+ while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables))
+ {
+ UT_LIST_REMOVE(lock.evicted_tables, table);
+ dict_mem_table_free(table);
+ }
+ }
+
+ ut_ad(!rsegs.m_redo.undo);
+ ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0);
+
+ if (trx_rseg_t *rseg= rsegs.m_redo.rseg)
+ {
+ mutex_enter(&rseg->mutex);
+ ut_ad(rseg->trx_ref_count > 0);
+ --rseg->trx_ref_count;
+ mutex_exit(&rseg->mutex);
+ }
+
+ if (mtr)
+ {
+ if (trx_undo_t *&undo= rsegs.m_noredo.undo)
+ {
+ ut_ad(undo->rseg == rsegs.m_noredo.rseg);
+ trx_undo_commit_cleanup(undo);
+ undo= nullptr;
+ }
+
+ /* NOTE that we could possibly make a group commit more efficient
+ here: call os_thread_yield here to allow other transactions to also
+ reach their commit! */
+
+ /*-------------------------------------*/
+
+ /* Depending on the my.cnf options, we may now write the log
+ buffer to the log files, making the transaction durable if the OS
+ does not crash. We may also flush the log files to disk, making
+ the transaction durable also at an OS crash or a power outage.
+
+ The idea in InnoDB's group commit is that a group of transactions
+ gather behind a trx doing a physical disk write to log files, and
+ when that physical write has been completed, one of those
+ transactions does a write which commits the whole group. Note that
+ this group commit will only bring benefit if there are > 2 users
+ in the database. Then at least 2 users can gather behind one doing
+ the physical log write to disk.
+
+ If we are calling trx_t::commit() under prepare_commit_mutex, we
+ will delay possible log write and flush to a separate function
+ trx_commit_complete_for_mysql(), which is only called when the
+ thread has released the mutex. This is to make the group commit
+ algorithm to work. Otherwise, the prepare_commit mutex would
+ serialize all commits and prevent a group of transactions from
+ gathering. */
+
+ commit_lsn= mtr->commit_lsn();
+ if (!commit_lsn)
+ /* Nothing to be done. */;
+ else if (flush_log_later)
+ /* Do nothing yet */
+ must_flush_log_later= true;
+ else if (srv_flush_log_at_trx_commit)
+ trx_flush_log_if_needed(commit_lsn, this);
+ }
+
+ ut_ad(!rsegs.m_noredo.undo);
+
+ /* Free all savepoints, starting from the first. */
+ trx_named_savept_t *savep= UT_LIST_GET_FIRST(trx_savepoints);
+
+ trx_roll_savepoints_free(this, savep);
+
+ if (fts_trx)
+ trx_finalize_for_fts(this, undo_no != 0);
+
+#ifdef WITH_WSREP
+ /* Serialization history has been written and the transaction is
+ committed in memory, which makes this commit ordered. Release commit
+ order critical section. */
+ if (wsrep)
+ {
+ wsrep= false;
+ wsrep_commit_ordered(mysql_thd);
+ }
+ lock.was_chosen_as_wsrep_victim= false;
+#endif /* WITH_WSREP */
+ trx_mutex_enter(this);
+ dict_operation= TRX_DICT_OP_NONE;
+
+ DBUG_LOG("trx", "Commit in memory: " << this);
+ state= TRX_STATE_NOT_STARTED;
+
+ assert_freed();
+ trx_init(this);
+ trx_mutex_exit(this);
+
+ ut_a(error_state == DB_SUCCESS);
+ if (!srv_read_only_mode)
+ srv_wake_purge_thread_if_not_active();
+}
+
+/** Commit the transaction in a mini-transaction.
+@param mtr mini-transaction (if there are any persistent modifications) */
+void trx_t::commit_low(mtr_t *mtr)
+{
+ ut_ad(!mtr || mtr->is_active());
+ ut_d(bool aborted = in_rollback && error_state == DB_DEADLOCK);
+ ut_ad(!mtr == (aborted || !has_logged()));
+ ut_ad(!mtr || !aborted);
+
+ /* undo_no is non-zero if we're doing the final commit. */
+ if (fts_trx && undo_no)
+ {
+ ut_a(!is_autocommit_non_locking());
+ /* FTS-FIXME: Temporarily tolerate DB_DUPLICATE_KEY instead of
+ dying. This is a possible scenario if there is a crash between
+ insert into the DELETED table committing and the transaction committing.
+ The proper fix would be to return the error from this function. */
+ if (dberr_t error= fts_commit(this))
+ ut_a(error == DB_DUPLICATE_KEY);
+ }
+
+#ifndef DBUG_OFF
+ const bool debug_sync= mysql_thd && has_logged_persistent();
+#endif
+
+ if (mtr)
+ {
+ trx_write_serialisation_history(this, mtr);
+
+ /* The following call commits the mini-transaction, making the
+ whole transaction committed in the file-based world, at this log
+ sequence number. The transaction becomes 'durable' when we write
+ the log to disk, but in the logical sense the commit in the
+ file-based data structures (undo logs etc.) happens here.
+
+ NOTE that transaction numbers, which are assigned only to
+ transactions with an update undo log, do not necessarily come in
+ exactly the same order as commit lsn's, if the transactions have
+ different rollback segments. To get exactly the same order we
+ should hold the kernel mutex up to this point, adding to the
+ contention of the kernel mutex. However, if a transaction T2 is
+ able to see modifications made by a transaction T1, T2 will always
+ get a bigger transaction number and a bigger commit lsn than T1. */
+
+ mtr->commit();
+ }
+#ifndef DBUG_OFF
+ if (debug_sync)
+ DEBUG_SYNC_C("before_trx_state_committed_in_memory");
+#endif
+
+ commit_in_memory(mtr);
+}
+
+
+void trx_t::commit()
+{
+ mtr_t *mtr= nullptr;
+ mtr_t local_mtr;
+
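+ /* A mini-transaction is needed only if this transaction has generated
+ any undo log (persistent or temporary); otherwise the commit does not
+ modify any pages. */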
+ if (has_logged())
+ {
+ mtr= &local_mtr;
+ local_mtr.start();
+ }
+ commit_low(mtr);
+}
+
+/****************************************************************//**
+Prepares a transaction for commit/rollback. */
+void
+trx_commit_or_rollback_prepare(
+/*===========================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ /* We are reading trx->state without holding trx->mutex
+ here, because the commit or rollback should be invoked for a
+ running (or recovered prepared) transaction that is associated
+ with the current thread. */
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx_start_low(trx, true);
+ /* fall through */
+
+ case TRX_STATE_ACTIVE:
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ /* If the trx is in a lock wait state, moves the waiting
+ query thread to the suspended state */
+
+ if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+ ut_a(trx->lock.wait_thr != NULL);
+ trx->lock.wait_thr->state = QUE_THR_SUSPENDED;
+ trx->lock.wait_thr = NULL;
+
+ trx->lock.que_state = TRX_QUE_RUNNING;
+ }
+
+ ut_ad(trx->lock.n_active_thrs == 1);
+ return;
+
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*********************************************************************//**
+Creates a commit command node struct.
+@return own: commit node struct */
+commit_node_t*
+trx_commit_node_create(
+/*===================*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ commit_node_t* node;
+
+ node = static_cast<commit_node_t*>(mem_heap_alloc(heap, sizeof(*node)));
+ node->common.type = QUE_NODE_COMMIT;
+ node->state = COMMIT_NODE_SEND;
+
+ return(node);
+}
+
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph.
+@return query thread to run next, or NULL */
+que_thr_t*
+trx_commit_step(
+/*============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ commit_node_t* node;
+
+ node = static_cast<commit_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = COMMIT_NODE_SEND;
+ }
+
+ if (node->state == COMMIT_NODE_SEND) {
+ trx_t* trx;
+
+ node->state = COMMIT_NODE_WAIT;
+
+ trx = thr_get_trx(thr);
+
+ ut_a(trx->lock.wait_thr == NULL);
+ ut_a(trx->lock.que_state != TRX_QUE_LOCK_WAIT);
+
+ trx_commit_or_rollback_prepare(trx);
+
+ trx->lock.que_state = TRX_QUE_COMMITTING;
+ trx->commit();
+ ut_ad(trx->lock.wait_thr == NULL);
+ trx->lock.que_state = TRX_QUE_RUNNING;
+
+ thr = NULL;
+ } else {
+ ut_ad(node->state == COMMIT_NODE_WAIT);
+
+ node->state = COMMIT_NODE_SEND;
+
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Does the transaction commit for MySQL.
+@return DB_SUCCESS or error number */
+dberr_t
+trx_commit_for_mysql(
+/*=================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ /* Because we do not commit by sending an InnoDB signal to the
+ transaction, we must make sure here that trx has been started. */
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ ut_d(trx->start_file = __FILE__);
+ ut_d(trx->start_line = __LINE__);
+
+ trx_start_low(trx, true);
+ /* fall through */
+ case TRX_STATE_ACTIVE:
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ trx->op_info = "committing";
+ trx->commit();
+ MONITOR_DEC(MONITOR_TRX_ACTIVE);
+ trx->op_info = "";
+ return(DB_SUCCESS);
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk if we called trx_commit_for_mysql()
+with trx->flush_log_later == TRUE. */
+void
+trx_commit_complete_for_mysql(
+/*==========================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ if (trx->id != 0
+ || !trx->must_flush_log_later
+ || (srv_flush_log_at_trx_commit == 1 && trx->active_commit_ordered)) {
+
+ return;
+ }
+
+ trx_flush_log_if_needed(trx->commit_lsn, trx);
+
+ trx->must_flush_log_later = false;
+}
+
+/**********************************************************************//**
+Marks the latest SQL statement ended. */
+void
+trx_mark_sql_stat_end(
+/*==================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ ut_a(trx);
+
+ switch (trx->state) {
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ case TRX_STATE_NOT_STARTED:
+ trx->undo_no = 0;
+ /* fall through */
+ case TRX_STATE_ACTIVE:
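+ /* Remember the current undo number as the starting point of the next
+ statement, so that rolling back the latest SQL statement will undo
+ exactly the rows modified by that statement. */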
+ trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+
+ if (trx->fts_trx != NULL) {
+ fts_savepoint_laststmt_refresh(trx);
+ }
+
+ return;
+ }
+
+ ut_error;
+}
+
+/**********************************************************************//**
+Prints info about a transaction. */
+void
+trx_print_low(
+/*==========*/
+ FILE* f,
+ /*!< in: output stream */
+ const trx_t* trx,
+ /*!< in: transaction */
+ ulint max_query_len,
+ /*!< in: max query length to print,
+ or 0 to use the default max length */
+ ulint n_rec_locks,
+ /*!< in: lock_number_of_rows_locked(&trx->lock) */
+ ulint n_trx_locks,
+ /*!< in: length of trx->lock.trx_locks */
+ ulint heap_size)
+ /*!< in: mem_heap_get_size(trx->lock.lock_heap) */
+{
+ ibool newline;
+
+ fprintf(f, "TRANSACTION " TRX_ID_FMT, trx_get_id_for_print(trx));
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ fputs(", not started", f);
+ goto state_ok;
+ case TRX_STATE_ACTIVE:
+ fprintf(f, ", ACTIVE %lu sec",
+ (ulong) difftime(time(NULL), trx->start_time));
+ goto state_ok;
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ fprintf(f, ", ACTIVE (PREPARED) %lu sec",
+ (ulong) difftime(time(NULL), trx->start_time));
+ goto state_ok;
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ fputs(", COMMITTED IN MEMORY", f);
+ goto state_ok;
+ }
+ fprintf(f, ", state %lu", (ulong) trx->state);
+ ut_ad(0);
+state_ok:
+ const char* op_info = trx->op_info;
+
+ if (*op_info) {
+ putc(' ', f);
+ fputs(op_info, f);
+ }
+
+ if (trx->is_recovered) {
+ fputs(" recovered trx", f);
+ }
+
+ putc('\n', f);
+
+ if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
+ fprintf(f, "mysql tables in use %lu, locked %lu\n",
+ (ulong) trx->n_mysql_tables_in_use,
+ (ulong) trx->mysql_n_tables_locked);
+ }
+
+ newline = TRUE;
+
+ /* trx->lock.que_state of an ACTIVE transaction may change
+ while we are not holding trx->mutex. We perform a dirty read
+ for performance reasons. */
+
+ switch (trx->lock.que_state) {
+ case TRX_QUE_RUNNING:
+ newline = FALSE; break;
+ case TRX_QUE_LOCK_WAIT:
+ fputs("LOCK WAIT ", f); break;
+ case TRX_QUE_ROLLING_BACK:
+ fputs("ROLLING BACK ", f); break;
+ case TRX_QUE_COMMITTING:
+ fputs("COMMITTING ", f); break;
+ default:
+ fprintf(f, "que state %lu ", (ulong) trx->lock.que_state);
+ }
+
+ if (n_trx_locks > 0 || heap_size > 400) {
+ newline = TRUE;
+
+ fprintf(f, "%lu lock struct(s), heap size %lu,"
+ " %lu row lock(s)",
+ (ulong) n_trx_locks,
+ (ulong) heap_size,
+ (ulong) n_rec_locks);
+ }
+
+ if (trx->undo_no != 0) {
+ newline = TRUE;
+ fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no);
+ }
+
+ if (newline) {
+ putc('\n', f);
+ }
+
+ if (trx->state != TRX_STATE_NOT_STARTED && trx->mysql_thd != NULL) {
+ innobase_mysql_print_thd(
+ f, trx->mysql_thd, static_cast<uint>(max_query_len));
+ }
+}
+
+/**********************************************************************//**
+Prints info about a transaction.
+The caller must hold lock_sys.mutex.
+When possible, use trx_print() instead. */
+void
+trx_print_latched(
+/*==============*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ulint max_query_len) /*!< in: max query length to print,
+ or 0 to use the default max length */
+{
+ ut_ad(lock_mutex_own());
+
+ trx_print_low(f, trx, max_query_len,
+ lock_number_of_rows_locked(&trx->lock),
+ UT_LIST_GET_LEN(trx->lock.trx_locks),
+ mem_heap_get_size(trx->lock.lock_heap));
+}
+
+/**********************************************************************//**
+Prints info about a transaction.
+Acquires and releases lock_sys.mutex. */
+void
+trx_print(
+/*======*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ulint max_query_len) /*!< in: max query length to print,
+ or 0 to use the default max length */
+{
+ ulint n_rec_locks;
+ ulint n_trx_locks;
+ ulint heap_size;
+
+ lock_mutex_enter();
+ n_rec_locks = lock_number_of_rows_locked(&trx->lock);
+ n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
+ heap_size = mem_heap_get_size(trx->lock.lock_heap);
+ lock_mutex_exit();
+
+ trx_print_low(f, trx, max_query_len,
+ n_rec_locks, n_trx_locks, heap_size);
+}
+
+/*******************************************************************//**
+Compares the "weight" (or size) of two transactions. Transactions that
+have edited non-transactional tables are considered heavier than ones
+that have not.
+@return TRUE if weight(a) >= weight(b) */
+bool
+trx_weight_ge(
+/*==========*/
+ const trx_t* a, /*!< in: transaction to be compared */
+ const trx_t* b) /*!< in: transaction to be compared */
+{
+ ibool a_notrans_edit;
+ ibool b_notrans_edit;
+
+ /* If mysql_thd is NULL for a transaction we assume that it has
+ not edited non-transactional tables. */
+
+ a_notrans_edit = a->mysql_thd != NULL
+ && thd_has_edited_nontrans_tables(a->mysql_thd);
+
+ b_notrans_edit = b->mysql_thd != NULL
+ && thd_has_edited_nontrans_tables(b->mysql_thd);
+
+ if (a_notrans_edit != b_notrans_edit) {
+
+ return(a_notrans_edit);
+ }
+
+ /* Either both had edited non-transactional tables or both had
+ not, we fall back to comparing the number of altered/locked
+ rows. */
+
+ return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
+}
+
+/** Prepare a transaction.
+@return log sequence number that makes the XA PREPARE durable
+@retval 0 if no changes needed to be made durable */
+static lsn_t trx_prepare_low(trx_t *trx)
+{
+ ut_ad(!trx->is_recovered);
+
+ mtr_t mtr;
+
+ if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) {
+ ut_ad(undo->rseg == trx->rsegs.m_noredo.rseg);
+
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ mutex_enter(&undo->rseg->mutex);
+ trx_undo_set_state_at_prepare(trx, undo, false, &mtr);
+ mutex_exit(&undo->rseg->mutex);
+
+ mtr.commit();
+ }
+
+ trx_undo_t* undo = trx->rsegs.m_redo.undo;
+
+ if (!undo) {
+ /* There were no changes to persistent tables. */
+ return(0);
+ }
+
+ trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
+ ut_ad(undo->rseg == rseg);
+
+ mtr.start();
+
+ /* Change the undo log segment states from TRX_UNDO_ACTIVE to
+ TRX_UNDO_PREPARED: these modifications to the file data
+ structure define the transaction as prepared in the file-based
+ world, at the serialization point of lsn. */
+
+ mutex_enter(&rseg->mutex);
+ trx_undo_set_state_at_prepare(trx, undo, false, &mtr);
+ mutex_exit(&rseg->mutex);
+
+ /* Make the XA PREPARE durable. */
+ mtr.commit();
+ ut_ad(mtr.commit_lsn() > 0);
+ return(mtr.commit_lsn());
+}
+
+/****************************************************************//**
+Prepares a transaction. */
+static
+void
+trx_prepare(
+/*========*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ /* Only fresh user transactions can be prepared.
+ Recovered transactions cannot. */
+ ut_a(!trx->is_recovered);
+
+ lsn_t lsn = trx_prepare_low(trx);
+
+ DBUG_EXECUTE_IF("ib_trx_crash_during_xa_prepare_step", DBUG_SUICIDE(););
+
+ ut_a(trx->state == TRX_STATE_ACTIVE);
+ trx_mutex_enter(trx);
+ trx->state = TRX_STATE_PREPARED;
+ trx_mutex_exit(trx);
+
+ if (lsn) {
+ /* Depending on the my.cnf options, we may now write the log
+ buffer to the log files, making the prepared state of the
+ transaction durable if the OS does not crash. We may also
+ flush the log files to disk, making the prepared state of the
+ transaction durable also at an OS crash or a power outage.
+
+ The idea in InnoDB's group prepare is that a group of
+ transactions gather behind a trx doing a physical disk write
+ to log files, and when that physical write has been completed,
+ one of those transactions does a write which prepares the whole
+ group. Note that this group prepare will only bring benefit if
+ there are > 2 users in the database. Then at least 2 users can
+ gather behind one doing the physical log write to disk.
+
+ We must not be holding any mutexes or latches here. */
+
+ trx_flush_log_if_needed(lsn, trx);
+ }
+}
+
+/** XA PREPARE a transaction.
+@param[in,out] trx transaction to prepare */
+void trx_prepare_for_mysql(trx_t* trx)
+{
+ trx_start_if_not_started_xa(trx, false);
+
+ trx->op_info = "preparing";
+
+ trx_prepare(trx);
+
+ trx->op_info = "";
+}
+
+
+struct trx_recover_for_mysql_callback_arg
+{
+ XID *xid_list;
+ uint len;
+ uint count;
+};
+
+
+static my_bool trx_recover_for_mysql_callback(rw_trx_hash_element_t *element,
+ trx_recover_for_mysql_callback_arg *arg)
+{
+ DBUG_ASSERT(arg->len > 0);
+ mutex_enter(&element->mutex);
+ if (trx_t *trx= element->trx)
+ {
+ /*
+ The state of a read-write transaction can only change from ACTIVE to
+ PREPARED while we are holding the element->mutex. But since this code
+ is executed only at startup, no state change should occur here.
+ */
+ if (trx_state_eq(trx, TRX_STATE_PREPARED))
+ {
+ ut_ad(trx->is_recovered);
+ ut_ad(trx->id);
+ if (arg->count == 0)
+ ib::info() << "Starting recovery for XA transactions...";
+ XID& xid= arg->xid_list[arg->count];
+ if (arg->count++ < arg->len)
+ {
+ trx->state= TRX_STATE_PREPARED_RECOVERED;
+ ib::info() << "Transaction " << trx->id
+ << " in prepared state after recovery";
+ ib::info() << "Transaction contains changes to " << trx->undo_no
+ << " rows";
+ xid= *trx->xid;
+ }
+ }
+ }
+ mutex_exit(&element->mutex);
+ /* Do not terminate upon reaching arg->len; count all transactions */
+ return false;
+}
+
+
+static my_bool trx_recover_reset_callback(rw_trx_hash_element_t *element,
+ void*)
+{
+ mutex_enter(&element->mutex);
+ if (trx_t *trx= element->trx)
+ {
+ if (trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED))
+ trx->state= TRX_STATE_PREPARED;
+ }
+ mutex_exit(&element->mutex);
+ return false;
+}
+
+
+/**
+ Find prepared transaction objects for recovery.
+
+ @param[out] xid_list prepared transactions
+ @param[in] len number of slots in xid_list
+
+ @return number of prepared transactions stored in xid_list
+*/
+
+int trx_recover_for_mysql(XID *xid_list, uint len)
+{
+ trx_recover_for_mysql_callback_arg arg= { xid_list, len, 0 };
+
+ ut_ad(xid_list);
+ ut_ad(len);
+
+ /* Fill xid_list with PREPARED transactions. */
+ trx_sys.rw_trx_hash.iterate_no_dups(trx_recover_for_mysql_callback, &arg);
+ if (arg.count)
+ {
+ ib::info() << arg.count
+ << " transactions in prepared state after recovery";
+ /* After returning the full list, reset the state, because
+ init_server_components() wants to recover the collection of
+ transactions twice, by first calling tc_log->open() and then
+ ha_recover() directly. */
+ if (arg.count <= len)
+ trx_sys.rw_trx_hash.iterate(trx_recover_reset_callback);
+ }
+ return int(std::min(arg.count, len));
+}
+
+
+struct trx_get_trx_by_xid_callback_arg
+{
+ const XID *xid;
+ trx_t *trx;
+};
+
+
+static my_bool trx_get_trx_by_xid_callback(rw_trx_hash_element_t *element,
+ trx_get_trx_by_xid_callback_arg *arg)
+{
+ my_bool found= 0;
+ mutex_enter(&element->mutex);
+ if (trx_t *trx= element->trx)
+ {
+ trx_mutex_enter(trx);
+ if (trx->is_recovered &&
+ (trx_state_eq(trx, TRX_STATE_PREPARED) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)) &&
+ arg->xid->eq(reinterpret_cast<XID*>(trx->xid)))
+ {
+#ifdef WITH_WSREP
+ /* The commit of a prepared recovered Galera
+ transaction needs a valid trx->xid for
+ invoking trx_sys_update_wsrep_checkpoint(). */
+ if (!wsrep_is_wsrep_xid(trx->xid))
+#endif /* WITH_WSREP */
+ /* Invalidate the XID, so that subsequent calls will not find it. */
+ trx->xid->null();
+ arg->trx= trx;
+ found= 1;
+ }
+ trx_mutex_exit(trx);
+ }
+ mutex_exit(&element->mutex);
+ return found;
+}
+
+/** Look up an X/Open distributed transaction in XA PREPARE state.
+@param[in] xid X/Open XA transaction identifier
+@return transaction on match (the trx_t::xid will be invalidated);
+note that the trx may have been committed before the caller acquires
+trx_t::mutex
+@retval NULL if no match */
+trx_t* trx_get_trx_by_xid(const XID* xid)
+{
+ trx_get_trx_by_xid_callback_arg arg= { xid, 0 };
+
+ if (xid)
+ trx_sys.rw_trx_hash.iterate(trx_get_trx_by_xid_callback, &arg);
+ return arg.trx;
+}
+
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_xa_low(
+/*============================*/
+ trx_t* trx, /*!< in/out: transaction */
+ bool read_write) /*!< in: true if read write transaction */
+{
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx_start_low(trx, read_write);
+ return;
+
+ case TRX_STATE_ACTIVE:
+ if (trx->id == 0 && read_write) {
+ /* If the transaction is tagged as read-only then
+ it can only write to temp tables and for such
+ transactions we don't want to move them to the
+ trx_sys_t::rw_trx_hash. */
+ if (!trx->read_only) {
+ trx_set_rw_mode(trx);
+ }
+ }
+ return;
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_low(
+/*==========================*/
+ trx_t* trx, /*!< in: transaction */
+ bool read_write) /*!< in: true if read write transaction */
+{
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx_start_low(trx, read_write);
+ return;
+
+ case TRX_STATE_ACTIVE:
+ if (read_write && trx->id == 0 && !trx->read_only) {
+ trx_set_rw_mode(trx);
+ }
+ return;
+
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*************************************************************//**
+Starts a transaction for internal processing. */
+void
+trx_start_internal_low(
+/*===================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ /* Ensure it is not flagged as an auto-commit-non-locking
+ transaction. */
+
+ trx->will_lock = true;
+
+ trx->internal = true;
+
+ trx_start_low(trx, true);
+}
+
+/** Starts a read-only transaction for internal processing.
+@param[in,out] trx transaction to be started */
+void
+trx_start_internal_read_only_low(
+ trx_t* trx)
+{
+ /* Ensure it is not flagged as an auto-commit-non-locking
+ transaction. */
+
+ trx->will_lock = true;
+
+ trx->internal = true;
+
+ trx_start_low(trx, false);
+}
+
+/*************************************************************//**
+Starts the transaction for a DDL operation. */
+void
+trx_start_for_ddl_low(
+/*==================*/
+ trx_t* trx, /*!< in/out: transaction */
+ trx_dict_op_t op) /*!< in: dictionary operation type */
+{
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ /* Flag this transaction as a dictionary operation, so that
+ the data dictionary will be locked in crash recovery. */
+
+ trx_set_dict_operation(trx, op);
+ trx->ddl= true;
+ trx_start_internal_low(trx);
+ return;
+
+ case TRX_STATE_ACTIVE:
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_PREPARED_RECOVERED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*************************************************************//**
+Set the transaction as a read-write transaction if it is not already
+tagged as such. Read-only transactions that are writing to temporary
+tables are assigned an ID and a rollback segment but are not added
+to the trx read-write list because their updates should not be visible
+to other transactions and therefore their changes can be ignored by
+MVCC. */
+void
+trx_set_rw_mode(
+/*============*/
+ trx_t* trx) /*!< in/out: transaction that is RW */
+{
+ ut_ad(trx->rsegs.m_redo.rseg == 0);
+ ut_ad(!trx->is_autocommit_non_locking());
+ ut_ad(!trx->read_only);
+ ut_ad(trx->id == 0);
+
+ if (high_level_read_only) {
+ return;
+ }
+
+ trx->rsegs.m_redo.rseg = trx_assign_rseg_low();
+ ut_ad(trx->rsegs.m_redo.rseg != 0);
+
+ trx_sys.register_rw(trx);
+
+ /* So that we can see our own changes. */
+ if (trx->read_view.is_open()) {
+ trx->read_view.set_creator_trx_id(trx->id);
+ }
+}
+
+bool trx_t::has_stats_table_lock() const
+{
+ for (lock_list::const_iterator it= lock.table_locks.begin(),
+ end= lock.table_locks.end(); it != end; ++it)
+ {
+ const lock_t *lock= *it;
+ if (lock && lock->un_member.tab_lock.table->is_stats_table())
+ return true;
+ }
+
+ return false;
+}
diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc
new file mode 100644
index 00000000..3d2d9752
--- /dev/null
+++ b/storage/innobase/trx/trx0undo.cc
@@ -0,0 +1,1401 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0undo.cc
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0undo.h"
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "trx0rseg.h"
+#include "log.h"
+
+/* How should the old versions in the history list be managed?
+ ----------------------------------------------------------
+If each transaction is given a whole page for its update undo log, file
+space consumption can be 10 times higher than necessary. Therefore,
+partly filled update undo log pages should be reusable. But then there
+is no way individual pages can be ordered so that the ordering agrees
+with the serialization numbers of the transactions on the pages. Thus,
+the history list must be formed of undo logs, not their header pages as
+it was in the old implementation.
+ However, on a single header page the transactions are placed in
+the order of their serialization numbers. As old versions are purged, we
+may free the page when the last transaction on the page has been purged.
+ A problem is that the purge has to go through the transactions
+in the serialization order. This means that we have to look through all
+rollback segments for the one that has the smallest transaction number
+in its history list.
+ When should we do a purge? A purge is necessary when space is
+running out in any of the rollback segments. Then we may have to purge
+also old versions which might be needed by some consistent read. How do
+we trigger the start of a purge? When a transaction writes to an undo log,
+it may notice that the space is running out. When a read view is closed,
+it may make some history superfluous. The server can have a utility which
+periodically checks if it can purge some history.
+ In a parallelized purge we have the problem that a query thread
+can remove a delete marked clustered index record before another query
+thread has processed an earlier version of the record, which cannot then
+be done because the row cannot be constructed from the clustered index
+record. To avoid this problem, we will store in the update and delete-mark
+undo records also the columns necessary to construct the secondary index
+entries which are modified.
+ We can latch the stack of versions of a single clustered index record
+by taking a latch on the clustered index page. As long as the latch is held,
+no new versions can be added and no versions removed by undo. But, a purge
+can still remove old versions from the bottom of the stack. */
+
+/* How to protect rollback segments, undo logs, and history lists with
+ -------------------------------------------------------------------
+latches?
+-------
+When a transaction does its first insert or modify in the clustered index, an
+undo log is assigned for it. Then we must have an x-latch on the rollback
+segment header.
+ When the transaction performs modifications or rolls back, its
+undo log is protected by undo page latches.
+Only the thread that is associated with the transaction may hold multiple
+undo page latches at a time. Undo pages are always private to a single
+transaction. Other threads that are performing MVCC reads
+or checking for implicit locks will lock at most one undo page at a time
+in trx_undo_get_undo_rec_low().
+ When the transaction commits, its persistent undo log is added
+to the history list. If it is not suitable for reuse, its slot is reset.
+In both cases, an x-latch must be acquired on the rollback segment header page.
+ The purge operation steps through the history list without modifying
+it until a truncate operation occurs, which can remove undo logs from the end
+of the list and release undo log segments. In stepping through the list,
+s-latches on the undo log pages are enough, but in a truncate, x-latches must
+be obtained on the rollback segment and individual pages. */
+
+/********************************************************************//**
+Creates and initializes an undo log memory object.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open XA transaction identification*/
+ uint32_t page_no,/*!< in: undo log header page number */
+ uint16_t offset);/*!< in: undo log header byte offset on page */
+
+/** Determine the start offset of undo log records of an undo log page.
+@param[in] block undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset
+@return start offset */
+static
+uint16_t trx_undo_page_get_start(const buf_block_t *block, uint32_t page_no,
+ uint16_t offset)
+{
+ return page_no == block->page.id().page_no()
+ ? mach_read_from_2(offset + TRX_UNDO_LOG_START + block->frame)
+ : TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE;
+}
+
+/** Get the first undo log record on a page.
+@param[in] block undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header page offset
+@return pointer to first record
+@retval NULL if none exists */
+static trx_undo_rec_t*
+trx_undo_page_get_first_rec(const buf_block_t *block, uint32_t page_no,
+ uint16_t offset)
+{
+ uint16_t start= trx_undo_page_get_start(block, page_no, offset);
+ return start == trx_undo_page_get_end(block, page_no, offset)
+ ? nullptr : block->frame + start;
+}
+
+/** Get the last undo log record on a page.
+@param[in] page undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header page offset
+@return pointer to last record
+@retval NULL if none exists */
+static
+trx_undo_rec_t*
+trx_undo_page_get_last_rec(const buf_block_t *block, uint32_t page_no,
+ uint16_t offset)
+{
+ uint16_t end= trx_undo_page_get_end(block, page_no, offset);
+ return trx_undo_page_get_start(block, page_no, offset) == end
+ ? nullptr : block->frame + mach_read_from_2(block->frame + end - 2);
+}
+
+/** Get the previous record in an undo log from the previous page.
+@param[in,out] block undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] shared latching mode: true=RW_S_LATCH, false=RW_X_LATCH
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+static trx_undo_rec_t*
+trx_undo_get_prev_rec_from_prev_page(buf_block_t *&block, uint16_t rec,
+ uint32_t page_no, uint16_t offset,
+ bool shared, mtr_t *mtr)
+{
+ uint32_t prev_page_no= flst_get_prev_addr(TRX_UNDO_PAGE_HDR +
+ TRX_UNDO_PAGE_NODE +
+ block->frame).page;
+
+ if (prev_page_no == FIL_NULL)
+ return nullptr;
+
+ block= buf_page_get(page_id_t(block->page.id().space(), prev_page_no),
+ 0, shared ? RW_S_LATCH : RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ return trx_undo_page_get_last_rec(block, page_no, offset);
+}
+
+/** Get the previous undo log record.
+@param[in] block undo log page
+@param[in] rec undo log record
+@param[in] page_no undo log header page number
+@param[in] offset undo log header page offset
+@return pointer to record
+@retval NULL if none */
+static
+trx_undo_rec_t*
+trx_undo_page_get_prev_rec(const buf_block_t *block, trx_undo_rec_t *rec,
+ uint32_t page_no, uint16_t offset)
+{
+ ut_ad(block->frame == page_align(rec));
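+ /* Each undo log record is terminated by two bytes that hold the page
+ offset of that record's start, so the two bytes immediately preceding
+ 'rec' locate the start of the preceding record. */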
+ return rec == block->frame + trx_undo_page_get_start(block, page_no, offset)
+ ? nullptr
+ : block->frame + mach_read_from_2(rec - 2);
+}
+
+/** Get the previous record in an undo log.
+@param[in,out] block undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] shared latching mode: true=RW_S_LATCH, false=RW_X_LATCH
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+trx_undo_rec_t*
+trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
+ uint16_t offset, bool shared, mtr_t *mtr)
+{
+ if (trx_undo_rec_t *prev= trx_undo_page_get_prev_rec(block,
+ block->frame + rec,
+ page_no, offset))
+ return prev;
+
+ /* We have to go to the previous undo log page to look for the
+ previous record */
+
+ return trx_undo_get_prev_rec_from_prev_page(block, rec, page_no, offset,
+ shared, mtr);
+}
+
+/** Get the next record in an undo log from the next page.
+@param[in,out] block undo log page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] mode latching mode: RW_S_LATCH or RW_X_LATCH
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+static trx_undo_rec_t*
+trx_undo_get_next_rec_from_next_page(buf_block_t *&block, uint32_t page_no,
+ uint16_t offset, ulint mode, mtr_t *mtr)
+{
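+ /* If this is the undo log header page and a later log header has been
+ created on it (TRX_UNDO_NEXT_LOG is nonzero), then this undo log does
+ not extend to any later page, and there is no next record. */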
+ if (page_no == block->page.id().page_no() &&
+ mach_read_from_2(block->frame + offset + TRX_UNDO_NEXT_LOG))
+ return NULL;
+
+ uint32_t next= flst_get_next_addr(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE +
+ block->frame).page;
+ if (next == FIL_NULL)
+ return NULL;
+
+ block= buf_page_get(page_id_t(block->page.id().space(), next), 0, mode, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ return trx_undo_page_get_first_rec(block, page_no, offset);
+}
+
+/** Get the next record in an undo log.
+@param[in,out] block undo log page
+@param[in] rec undo record offset in the page
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+trx_undo_rec_t*
+trx_undo_get_next_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
+ uint16_t offset, mtr_t *mtr)
+{
+ if (trx_undo_rec_t *next= trx_undo_page_get_next_rec(block, rec, page_no,
+ offset))
+ return next;
+
+ return trx_undo_get_next_rec_from_next_page(block, page_no, offset,
+ RW_S_LATCH, mtr);
+}
+
+/** Get the first record in an undo log.
+@param[in] space undo log header space
+@param[in] page_no undo log header page number
+@param[in] offset undo log header offset on page
+@param[in] mode latching mode: RW_S_LATCH or RW_X_LATCH
+@param[out] block undo log page
+@param[in,out] mtr mini-transaction
+@return undo log record, the page latched, NULL if none */
+trx_undo_rec_t*
+trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no,
+ uint16_t offset, ulint mode, buf_block_t*& block,
+ mtr_t *mtr)
+{
+ block = buf_page_get(page_id_t(space.id, page_no), 0, mode, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ if (trx_undo_rec_t *rec= trx_undo_page_get_first_rec(block, page_no, offset))
+ return rec;
+
+ return trx_undo_get_next_rec_from_next_page(block, page_no, offset, mode,
+ mtr);
+}
+
+/*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/
+
+/** Initialize an undo log page.
+NOTE: This corresponds to a redo log record and must not be changed!
+@see mtr_t::undo_create()
+@param[in,out] block undo log page */
+void trx_undo_page_init(const buf_block_t &block)
+{
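+ /* These are plain, unlogged writes to the page frame; the redo log
+ coverage for this initialization is the single record that the caller
+ writes via mtr_t::undo_create() (see the note above). */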
+ mach_write_to_2(my_assume_aligned<2>(FIL_PAGE_TYPE + block.frame),
+ FIL_PAGE_UNDO_LOG);
+ static_assert(TRX_UNDO_PAGE_HDR == FIL_PAGE_DATA, "compatibility");
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + block.frame,
+ 0, 2);
+ mach_write_to_2(my_assume_aligned<2>
+ (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.frame),
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ memcpy_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block.frame,
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.frame, 2);
+ /* The following corresponds to flst_zero_both(), but without writing log. */
+ memset_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV +
+ FIL_ADDR_PAGE + block.frame, 0xff, 4);
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV +
+ FIL_ADDR_BYTE + block.frame, 0, 2);
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_NEXT +
+ FIL_ADDR_PAGE + block.frame, 0xff, 4);
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_NEXT +
+ FIL_ADDR_BYTE + block.frame, 0, 2);
+ static_assert(TRX_UNDO_PAGE_NODE + FLST_NEXT + FIL_ADDR_BYTE + 2 ==
+ TRX_UNDO_PAGE_HDR_SIZE, "compatibility");
+ /* Preserve TRX_UNDO_SEG_HDR, but clear the rest of the page. */
+ memset_aligned<2>(TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE + block.frame, 0,
+ srv_page_size - (TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE +
+ FIL_PAGE_DATA_END));
+}
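+
+#if 0
+/* Minimal sketch (not compiled) of the state that trx_undo_page_init()
+leaves behind; the checking helper is hypothetical and only spells out the
+header fields initialized above. */
+static void trx_undo_page_init_expectation(const buf_block_t &block)
+{
+  ut_ad(mach_read_from_2(FIL_PAGE_TYPE + block.frame) == FIL_PAGE_UNDO_LOG);
+  /* No records yet: both the start and free pointers point just past the
+  undo page header. */
+  ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START
+                         + block.frame)
+        == TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+  ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+                         + block.frame)
+        == TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+  /* The page list node is detached: both neighbours are FIL_NULL. */
+  ut_ad(flst_get_prev_addr(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE
+                           + block.frame).page == FIL_NULL);
+  ut_ad(flst_get_next_addr(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE
+                           + block.frame).page == FIL_NULL);
+}
+#endif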
+
+/** Look for a free slot for an undo log segment.
+@param rseg_header rollback segment header
+@return slot index
+@retval ULINT_UNDEFINED if not found */
+static ulint trx_rsegf_undo_find_free(const buf_block_t *rseg_header)
+{
+ ulint max_slots= TRX_RSEG_N_SLOTS;
+
+#ifdef UNIV_DEBUG
+ if (trx_rseg_n_slots_debug)
+ max_slots= std::min<ulint>(trx_rseg_n_slots_debug, TRX_RSEG_N_SLOTS);
+#endif
+
+ for (ulint i= 0; i < max_slots; i++)
+ if (trx_rsegf_get_nth_undo(rseg_header, i) == FIL_NULL)
+ return i;
+
+ return ULINT_UNDEFINED;
+}
+
+/** Create an undo log segment.
+@param[in,out] space tablespace
+@param[in,out] rseg_hdr rollback segment header (x-latched)
+@param[out] id undo slot number
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return undo log block
+@retval NULL on failure */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+buf_block_t*
+trx_undo_seg_create(fil_space_t *space, buf_block_t *rseg_hdr, ulint *id,
+ dberr_t *err, mtr_t *mtr)
+{
+ buf_block_t* block;
+ uint32_t n_reserved;
+ bool success;
+
+ const ulint slot_no = trx_rsegf_undo_find_free(rseg_hdr);
+
+ if (slot_no == ULINT_UNDEFINED) {
+ ib::warn() << "Cannot find a free slot for an undo log. Do"
+ " you have too many active transactions running"
+ " concurrently?";
+
+ *err = DB_TOO_MANY_CONCURRENT_TRXS;
+ return NULL;
+ }
+
+ ut_ad(slot_no < TRX_RSEG_N_SLOTS);
+
+ success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO,
+ mtr);
+ if (!success) {
+ *err = DB_OUT_OF_FILE_SPACE;
+ return NULL;
+ }
+
+ /* Allocate a new file segment for the undo log */
+ block = fseg_create(space, TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER,
+ mtr, true);
+
+ space->release_free_extents(n_reserved);
+
+ if (block == NULL) {
+ *err = DB_OUT_OF_FILE_SPACE;
+ return NULL;
+ }
+
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ mtr->undo_create(*block);
+ trx_undo_page_init(*block);
+
+ mtr->write<2>(*block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + block->frame,
+ TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*block,
+ TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+ + block->frame, 0U);
+
+ flst_init(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame,
+ mtr);
+
+ flst_add_last(block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+
+ *id = slot_no;
+ mtr->write<4>(*rseg_hdr, TRX_RSEG + TRX_RSEG_UNDO_SLOTS
+ + slot_no * TRX_RSEG_SLOT_SIZE + rseg_hdr->frame,
+ block->page.id().page_no());
+
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED);
+
+ *err = DB_SUCCESS;
+ return block;
+}
+
+/** Initialize an undo log header.
+@param[in,out] undo_page undo log segment header page
+@param[in] trx_id transaction identifier
+@param[in,out] mtr mini-transaction
+@return header byte offset on page */
+static uint16_t trx_undo_header_create(buf_block_t *undo_page, trx_id_t trx_id,
+ mtr_t* mtr)
+{
+ /* Reset the TRX_UNDO_PAGE_TYPE in case this page is being
+ repurposed after upgrading to MariaDB 10.3. */
+ byte *undo_type= my_assume_aligned<2>
+ (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + undo_page->frame);
+ ut_ad(mach_read_from_2(undo_type) <= 2);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_type, 0U);
+ byte *start= my_assume_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START +
+ undo_page->frame);
+ const uint16_t free= mach_read_from_2(start + 2);
+ static_assert(TRX_UNDO_PAGE_START + 2 == TRX_UNDO_PAGE_FREE,
+ "compatibility");
+ ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < srv_page_size - 100);
+
+ mach_write_to_2(start, free + TRX_UNDO_LOG_XA_HDR_SIZE);
+ /* Both TRX_UNDO_PAGE_START and TRX_UNDO_PAGE_FREE get the same value.
+ A WRITE record of 2 bytes is never longer than a MEMMOVE record, so
+ logging the two fields as WRITE 2+2 bytes would beat WRITE+MEMMOVE.
+ But a MEMSET record with a 2-byte pattern covers both fields in
+ 1+2 bytes, that is, one byte shorter still. */
+ memcpy_aligned<2>(start + 2, start, 2);
+ mtr->memset(*undo_page, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START, 4,
+ start, 2);
+ uint16_t prev_log= mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG +
+ undo_page->frame);
+ alignas(4) byte buf[4];
+ mach_write_to_2(buf, TRX_UNDO_ACTIVE);
+ mach_write_to_2(buf + 2, free);
+ static_assert(TRX_UNDO_STATE + 2 == TRX_UNDO_LAST_LOG, "compatibility");
+ static_assert(!((TRX_UNDO_SEG_HDR + TRX_UNDO_STATE) % 4), "alignment");
+ mtr->memcpy(*undo_page, my_assume_aligned<4>
+ (TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + undo_page->frame),
+ buf, 4);
+ if (prev_log)
+ mtr->write<2>(*undo_page, prev_log + TRX_UNDO_NEXT_LOG + undo_page->frame,
+ free);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_TRX_ID +
+ undo_page->frame, trx_id);
+ /* Write TRX_UNDO_NEEDS_PURGE=1 and TRX_UNDO_LOG_START. */
+ mach_write_to_2(buf, 1);
+ memcpy_aligned<2>(buf + 2, start, 2);
+ static_assert(TRX_UNDO_NEEDS_PURGE + 2 == TRX_UNDO_LOG_START,
+ "compatibility");
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_NEEDS_PURGE +
+ undo_page->frame, buf, 4);
+ /* Initialize all fields TRX_UNDO_XID_EXISTS to TRX_UNDO_HISTORY_NODE. */
+ if (prev_log)
+ {
+ mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS,
+ TRX_UNDO_PREV_LOG - TRX_UNDO_XID_EXISTS, 0);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_PREV_LOG +
+ undo_page->frame, prev_log);
+ static_assert(TRX_UNDO_PREV_LOG + 2 == TRX_UNDO_HISTORY_NODE,
+ "compatibility");
+ mtr->memset(undo_page, free + TRX_UNDO_HISTORY_NODE, FLST_NODE_SIZE, 0);
+ static_assert(TRX_UNDO_LOG_OLD_HDR_SIZE == TRX_UNDO_HISTORY_NODE +
+ FLST_NODE_SIZE, "compatibility");
+ }
+ else
+ mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS,
+ TRX_UNDO_LOG_OLD_HDR_SIZE - TRX_UNDO_XID_EXISTS, 0);
+ return free;
+}
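+
+#if 0
+/* Minimal sketch (not compiled) of what trx_undo_header_create() writes at
+the returned offset; the checking helper is hypothetical and merely restates
+the fields initialized above. */
+static void trx_undo_header_create_expectation(const buf_block_t *undo_page,
+                                               uint16_t free, trx_id_t trx_id)
+{
+  const byte *hdr= undo_page->frame + free;
+  ut_ad(mach_read_from_8(hdr + TRX_UNDO_TRX_ID) == trx_id);
+  ut_ad(mach_read_from_2(hdr + TRX_UNDO_NEEDS_PURGE) == 1);
+  /* The XID is not written here; trx_undo_write_xid() may set it later. */
+  ut_ad(!hdr[TRX_UNDO_XID_EXISTS]);
+  /* The segment header now points to this log as the last one, and the
+  log is active. */
+  ut_ad(mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+                         + undo_page->frame) == TRX_UNDO_ACTIVE);
+  ut_ad(mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+                         + undo_page->frame) == free);
+}
+#endif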
+
+/** Write X/Open XA Transaction Identifier (XID) to undo log header
+@param[in,out] block undo header page
+@param[in] offset undo header record offset
+@param[in] xid distributed transaction identifier
+@param[in,out] mtr mini-transaction */
+static void trx_undo_write_xid(buf_block_t *block, uint16_t offset,
+ const XID &xid, mtr_t *mtr)
+{
+ DBUG_ASSERT(xid.gtrid_length > 0);
+ DBUG_ASSERT(xid.bqual_length >= 0);
+ DBUG_ASSERT(xid.gtrid_length <= MAXGTRIDSIZE);
+ DBUG_ASSERT(xid.bqual_length <= MAXBQUALSIZE);
+ static_assert(MAXGTRIDSIZE + MAXBQUALSIZE == XIDDATASIZE,
+ "gtrid and bqual don't fit xid data");
+ DBUG_ASSERT(mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG +
+ block->frame) == offset);
+
+ trx_ulogf_t* log_hdr= block->frame + offset;
+
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_FORMAT,
+ static_cast<uint32_t>(xid.formatID));
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_TRID_LEN,
+ static_cast<uint32_t>(xid.gtrid_length));
+ mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_BQUAL_LEN,
+ static_cast<uint32_t>(xid.bqual_length));
+ const ulint xid_length= static_cast<ulint>(xid.gtrid_length
+ + xid.bqual_length);
+ mtr->memcpy(*block, &block->frame[offset + TRX_UNDO_XA_XID],
+ xid.data, xid_length);
+ if (UNIV_LIKELY(xid_length < XIDDATASIZE))
+ mtr->memset(block, offset + TRX_UNDO_XA_XID + xid_length,
+ XIDDATASIZE - xid_length, 0);
+}
+
+/********************************************************************//**
+Read X/Open XA Transaction Identifier (XID) from an undo log header */
+static
+void
+trx_undo_read_xid(const trx_ulogf_t* log_hdr, XID* xid)
+{
+ xid->formatID=static_cast<long>(mach_read_from_4(
+ log_hdr + TRX_UNDO_XA_FORMAT));
+
+ xid->gtrid_length=static_cast<long>(mach_read_from_4(
+ log_hdr + TRX_UNDO_XA_TRID_LEN));
+
+ xid->bqual_length=static_cast<long>(mach_read_from_4(
+ log_hdr + TRX_UNDO_XA_BQUAL_LEN));
+
+ memcpy(xid->data, log_hdr + TRX_UNDO_XA_XID, XIDDATASIZE);
+}
+
+/** Allocate an undo log page.
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction that does not hold any page latch
+@return X-latched block if success
+@retval NULL on failure */
+buf_block_t* trx_undo_add_page(trx_undo_t* undo, mtr_t* mtr)
+{
+ trx_rseg_t* rseg = undo->rseg;
+ buf_block_t* new_block = NULL;
+ uint32_t n_reserved;
+
+ /* When we add a page to an undo log, this is analogous to
+ a pessimistic insert in a B-tree, and we must reserve the
+ counterpart of the tree latch, which is the rseg mutex. */
+
+ mutex_enter(&rseg->mutex);
+
+ buf_block_t* header_block = trx_undo_page_get(
+ page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr);
+
+ if (!fsp_reserve_free_extents(&n_reserved, undo->rseg->space, 1,
+ FSP_UNDO, mtr)) {
+ goto func_exit;
+ }
+
+ new_block = fseg_alloc_free_page_general(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ + header_block->frame,
+ undo->top_page_no + 1, FSP_UP, true, mtr, mtr);
+
+ rseg->space->release_free_extents(n_reserved);
+
+ if (!new_block) {
+ goto func_exit;
+ }
+
+ ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
+ buf_block_dbg_add_level(new_block, SYNC_TRX_UNDO_PAGE);
+ undo->last_page_no = new_block->page.id().page_no();
+
+ mtr->undo_create(*new_block);
+ trx_undo_page_init(*new_block);
+
+ flst_add_last(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ new_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+ undo->size++;
+ rseg->curr_size++;
+
+func_exit:
+ mutex_exit(&rseg->mutex);
+ return(new_block);
+}
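+
+#if 0
+/* Minimal sketch (not compiled): growing an undo log when its last page has
+no room for another record. The caller shown here is hypothetical; in the
+real code this happens while reporting a row operation. */
+static buf_block_t *trx_undo_grow_example(trx_undo_t *undo, mtr_t *mtr)
+{
+  /* trx_undo_add_page() acquires the rseg mutex itself, reserves a free
+  extent, allocates the page and appends it to TRX_UNDO_PAGE_LIST. */
+  buf_block_t *new_block= trx_undo_add_page(undo, mtr);
+  /* NULL means the tablespace is full; the caller would typically report
+  DB_OUT_OF_FILE_SPACE. */
+  return new_block;
+}
+#endif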
+
+/********************************************************************//**
+Frees an undo log page that is not the header page.
+@return last page number in remaining log */
+static
+uint32_t
+trx_undo_free_page(
+/*===============*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ bool in_history, /*!< in: TRUE if the undo log is in the history
+ list */
+ uint32_t hdr_page_no, /*!< in: header page number */
+ uint32_t page_no, /*!< in: page number to free: must not be the
+ header page */
+ mtr_t* mtr) /*!< in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+{
+ const ulint space = rseg->space->id;
+
+ ut_a(hdr_page_no != page_no);
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ buf_block_t* undo_block = trx_undo_page_get(page_id_t(space, page_no),
+ mtr);
+ buf_block_t* header_block = trx_undo_page_get(page_id_t(space,
+ hdr_page_no),
+ mtr);
+
+ flst_remove(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+
+ fseg_free_page(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ + header_block->frame,
+ rseg->space, page_no, mtr);
+ buf_page_free(rseg->space, page_no, mtr, __FILE__, __LINE__);
+
+ const fil_addr_t last_addr = flst_get_last(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + header_block->frame);
+ rseg->curr_size--;
+
+ if (in_history) {
+ buf_block_t* rseg_header = trx_rsegf_get(
+ rseg->space, rseg->page_no, mtr);
+ byte* rseg_hist_size = TRX_RSEG + TRX_RSEG_HISTORY_SIZE
+ + rseg_header->frame;
+ uint32_t hist_size = mach_read_from_4(rseg_hist_size);
+ ut_ad(hist_size > 0);
+ mtr->write<4>(*rseg_header, rseg_hist_size, hist_size - 1);
+ }
+
+ return(last_addr.page);
+}
+
+/** Free the last undo log page. The caller must hold the rseg mutex.
+@param[in,out] undo undo log
+@param[in,out] mtr mini-transaction that does not hold any undo log page latch,
+ or that has allocated the undo log page
+void
+trx_undo_free_last_page(trx_undo_t* undo, mtr_t* mtr)
+{
+ ut_ad(undo->hdr_page_no != undo->last_page_no);
+ ut_ad(undo->size > 0);
+
+ undo->last_page_no = trx_undo_free_page(
+ undo->rseg, false, undo->hdr_page_no, undo->last_page_no, mtr);
+
+ undo->size--;
+}
+
+/** Truncate the tail of an undo log during rollback.
+@param[in,out] undo undo log
+@param[in] limit all undo records with undo number >= limit are discarded
+@param[in] is_temp whether this is a temporary undo log
+void trx_undo_truncate_end(trx_undo_t& undo, undo_no_t limit, bool is_temp)
+{
+ mtr_t mtr;
+ ut_ad(is_temp == !undo.rseg->is_persistent());
+
+ for (;;) {
+ mtr.start();
+ if (is_temp) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ trx_undo_rec_t* trunc_here = NULL;
+ mutex_enter(&undo.rseg->mutex);
+ buf_block_t* undo_block = trx_undo_page_get(
+ page_id_t(undo.rseg->space->id, undo.last_page_no),
+ &mtr);
+ trx_undo_rec_t* rec = trx_undo_page_get_last_rec(
+ undo_block, undo.hdr_page_no, undo.hdr_offset);
+ while (rec) {
+ if (trx_undo_rec_get_undo_no(rec) < limit) {
+ goto func_exit;
+ }
+ /* Truncate at least this record off, maybe more */
+ trunc_here = rec;
+
+ rec = trx_undo_page_get_prev_rec(undo_block, rec,
+ undo.hdr_page_no,
+ undo.hdr_offset);
+ }
+
+ if (undo.last_page_no != undo.hdr_page_no) {
+ trx_undo_free_last_page(&undo, &mtr);
+ mutex_exit(&undo.rseg->mutex);
+ mtr.commit();
+ continue;
+ }
+
+func_exit:
+ mutex_exit(&undo.rseg->mutex);
+
+ if (trunc_here) {
+ mtr.write<2>(*undo_block,
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + undo_block->frame,
+ ulint(trunc_here - undo_block->frame));
+ }
+
+ mtr.commit();
+ return;
+ }
+}
+
+/** Truncate the head of an undo log.
+NOTE that only whole pages are freed; the header page is not
+freed, but emptied, if all the records there are below the limit.
+@param[in,out] rseg rollback segment
+@param[in] hdr_page_no header page number
+@param[in] hdr_offset header offset on the page
+@param[in] limit first undo number to preserve
+(everything below the limit will be truncated) */
+void
+trx_undo_truncate_start(
+ trx_rseg_t* rseg,
+ uint32_t hdr_page_no,
+ uint16_t hdr_offset,
+ undo_no_t limit)
+{
+ trx_undo_rec_t* rec;
+ trx_undo_rec_t* last_rec;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ if (!limit) {
+ return;
+ }
+loop:
+ mtr_start(&mtr);
+
+ if (!rseg->is_persistent()) {
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ }
+
+ buf_block_t* undo_page;
+ rec = trx_undo_get_first_rec(*rseg->space, hdr_page_no, hdr_offset,
+ RW_X_LATCH, undo_page, &mtr);
+ if (rec == NULL) {
+ /* Already empty */
+done:
+ mtr.commit();
+ return;
+ }
+
+ last_rec = trx_undo_page_get_last_rec(undo_page, hdr_page_no,
+ hdr_offset);
+ if (trx_undo_rec_get_undo_no(last_rec) >= limit) {
+ goto done;
+ }
+
+ if (undo_page->page.id().page_no() == hdr_page_no) {
+ uint16_t end = mach_read_from_2(hdr_offset + TRX_UNDO_NEXT_LOG
+ + undo_page->frame);
+ if (end == 0) {
+ end = mach_read_from_2(TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE
+ + undo_page->frame);
+ }
+
+ mtr.write<2>(*undo_page, undo_page->frame + hdr_offset
+ + TRX_UNDO_LOG_START, end);
+ } else {
+ trx_undo_free_page(rseg, true, hdr_page_no,
+ undo_page->page.id().page_no(), &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ goto loop;
+}
+
+/** Frees an undo log segment which is not in the history list.
+@param undo temporary undo log */
+static void trx_undo_seg_free(const trx_undo_t *undo)
+{
+ ut_ad(undo->id < TRX_RSEG_N_SLOTS);
+
+ trx_rseg_t* const rseg = undo->rseg;
+ bool finished;
+ mtr_t mtr;
+ ut_ad(rseg->space == fil_system.temp_space);
+
+ do {
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ buf_block_t* block = trx_undo_page_get(
+ page_id_t(SRV_TMP_SPACE_ID, undo->hdr_page_no), &mtr);
+
+ fseg_header_t* file_seg = TRX_UNDO_SEG_HDR
+ + TRX_UNDO_FSEG_HEADER + block->frame;
+
+ finished = fseg_free_step(file_seg, &mtr);
+
+ if (finished) {
+ /* Update the rseg header */
+ buf_block_t* rseg_header = trx_rsegf_get(
+ rseg->space, rseg->page_no, &mtr);
+ compile_time_assert(FIL_NULL == 0xffffffff);
+ memset(TRX_RSEG + TRX_RSEG_UNDO_SLOTS
+ + undo->id * TRX_RSEG_SLOT_SIZE +
+ rseg_header->frame, 0xff, 4);
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
+ }
+
+ mtr.commit();
+ } while (!finished);
+}
+
+/*========== UNDO LOG MEMORY COPY INITIALIZATION =====================*/
+
+/** Read an undo log when starting up the database.
+@param[in,out] rseg rollback segment
+@param[in] id rollback segment slot
+@param[in] page_no undo log segment page number
+@param[in,out] max_trx_id the largest observed transaction ID
+@return the undo log
+@retval nullptr on error */
+trx_undo_t *
+trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no,
+ trx_id_t &max_trx_id)
+{
+ mtr_t mtr;
+ XID xid;
+
+ ut_ad(id < TRX_RSEG_N_SLOTS);
+
+ mtr.start();
+ const buf_block_t* block = trx_undo_page_get(
+ page_id_t(rseg->space->id, page_no), &mtr);
+ const uint16_t type = mach_read_from_2(TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE
+ + block->frame);
+ if (UNIV_UNLIKELY(type > 2)) {
+corrupted_type:
+ sql_print_error("InnoDB: unsupported undo header type %u",
+ type);
+corrupted:
+ mtr.commit();
+ return nullptr;
+ }
+
+ uint16_t offset = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+ + block->frame);
+ if (offset < TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE ||
+ offset >= srv_page_size - TRX_UNDO_LOG_OLD_HDR_SIZE) {
+ sql_print_error("InnoDB: invalid undo header offset %u",
+ offset);
+ goto corrupted;
+ }
+
+ const trx_ulogf_t* const undo_header = block->frame + offset;
+ uint16_t state = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+ + block->frame);
+ switch (state) {
+ case TRX_UNDO_ACTIVE:
+ case TRX_UNDO_PREPARED:
+ if (UNIV_LIKELY(type != 1)) {
+ break;
+ }
+ sql_print_error("InnoDB: upgrade from a version older than"
+ " MariaDB 10.3 requires a clean shutdown");
+ goto corrupted;
+ default:
+ sql_print_error("InnoDB: unsupported undo header state %u",
+ state);
+ goto corrupted;
+ case TRX_UNDO_TO_PURGE:
+ if (UNIV_UNLIKELY(type == 1)) {
+ goto corrupted_type;
+ }
+ /* fall through */
+ case TRX_UNDO_CACHED:
+ trx_id_t id = mach_read_from_8(TRX_UNDO_TRX_NO + undo_header);
+ if (id >> 48) {
+ sql_print_error("InnoDB: corrupted TRX_NO %llx", id);
+ goto corrupted;
+ }
+ if (id > max_trx_id) {
+ max_trx_id = id;
+ }
+ }
+
+ /* Read X/Open XA transaction identification if it exists, or
+ set it to NULL. */
+
+ if (undo_header[TRX_UNDO_XID_EXISTS]) {
+ trx_undo_read_xid(undo_header, &xid);
+ } else {
+ xid.null();
+ }
+
+ trx_id_t trx_id = mach_read_from_8(undo_header + TRX_UNDO_TRX_ID);
+ if (trx_id >> 48) {
+ sql_print_error("InnoDB: corrupted TRX_ID %llx", trx_id);
+ goto corrupted;
+ }
+ if (trx_id > max_trx_id) {
+ max_trx_id = trx_id;
+ }
+
+ mutex_enter(&rseg->mutex);
+ trx_undo_t* undo = trx_undo_mem_create(
+ rseg, id, trx_id, &xid, page_no, offset);
+ mutex_exit(&rseg->mutex);
+ if (!undo) {
+ return undo;
+ }
+
+ undo->dict_operation = undo_header[TRX_UNDO_DICT_TRANS];
+ undo->table_id = mach_read_from_8(undo_header + TRX_UNDO_TABLE_ID);
+ undo->size = flst_get_len(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST
+ + block->frame);
+
+ fil_addr_t last_addr = flst_get_last(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame);
+
+ undo->last_page_no = last_addr.page;
+ undo->top_page_no = last_addr.page;
+
+ const buf_block_t* last = trx_undo_page_get(
+ page_id_t(rseg->space->id, undo->last_page_no), &mtr);
+
+ if (const trx_undo_rec_t* rec = trx_undo_page_get_last_rec(
+ last, page_no, offset)) {
+ undo->top_offset = static_cast<uint16_t>(rec - last->frame);
+ undo->top_undo_no = trx_undo_rec_get_undo_no(rec);
+ ut_ad(!undo->empty());
+ } else {
+ undo->top_undo_no = IB_ID_MAX;
+ ut_ad(undo->empty());
+ }
+
+ undo->state = state;
+
+ if (state != TRX_UNDO_CACHED) {
+ UT_LIST_ADD_LAST(rseg->undo_list, undo);
+ } else {
+ UT_LIST_ADD_LAST(rseg->undo_cached, undo);
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ }
+
+ mtr.commit();
+ return undo;
+}
+
+/********************************************************************//**
+Creates and initializes an undo log memory object.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open transaction identification */
+ uint32_t page_no,/*!< in: undo log header page number */
+ uint16_t offset) /*!< in: undo log header byte offset on page */
+{
+ trx_undo_t* undo;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ ut_a(id < TRX_RSEG_N_SLOTS);
+
+ undo = static_cast<trx_undo_t*>(ut_malloc_nokey(sizeof(*undo)));
+
+ if (undo == NULL) {
+
+ return(NULL);
+ }
+
+ undo->id = id;
+ undo->state = TRX_UNDO_ACTIVE;
+ undo->trx_id = trx_id;
+ undo->xid = *xid;
+
+ undo->dict_operation = FALSE;
+
+ undo->rseg = rseg;
+
+ undo->hdr_page_no = page_no;
+ undo->hdr_offset = offset;
+ undo->last_page_no = page_no;
+ undo->size = 1;
+
+ undo->top_undo_no = IB_ID_MAX;
+ undo->top_page_no = page_no;
+ undo->guess_block = NULL;
+ ut_ad(undo->empty());
+
+ return(undo);
+}
+
+/********************************************************************//**
+Initializes a cached undo log object for new use. */
+static
+void
+trx_undo_mem_init_for_reuse(
+/*========================*/
+ trx_undo_t* undo, /*!< in: undo log to init */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open XA transaction identification*/
+ uint16_t offset) /*!< in: undo log header byte offset on page */
+{
+ ut_ad(mutex_own(&((undo->rseg)->mutex)));
+
+ ut_a(undo->id < TRX_RSEG_N_SLOTS);
+
+ undo->state = TRX_UNDO_ACTIVE;
+ undo->trx_id = trx_id;
+ undo->xid = *xid;
+
+ undo->dict_operation = FALSE;
+
+ undo->hdr_offset = offset;
+ undo->top_undo_no = IB_ID_MAX;
+ ut_ad(undo->empty());
+}
+
+/** Create an undo log.
+@param[in,out] trx transaction
+@param[in,out] rseg rollback segment
+@param[out] undo undo log object
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return undo log block
+@retval NULL on failure */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+buf_block_t*
+trx_undo_create(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
+ dberr_t* err, mtr_t* mtr)
+{
+ ulint id;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ buf_block_t* block = trx_undo_seg_create(
+ rseg->space,
+ trx_rsegf_get(rseg->space, rseg->page_no, mtr), &id, err, mtr);
+
+ if (!block) {
+ return NULL;
+ }
+
+ rseg->curr_size++;
+
+ uint16_t offset = trx_undo_header_create(block, trx->id, mtr);
+
+ *undo = trx_undo_mem_create(rseg, id, trx->id, trx->xid,
+ block->page.id().page_no(), offset);
+ if (*undo == NULL) {
+ *err = DB_OUT_OF_MEMORY;
+ /* FIXME: this will not free the undo block to the file */
+ return NULL;
+ } else if (rseg != trx->rsegs.m_redo.rseg) {
+ return block;
+ }
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ break;
+ case TRX_DICT_OP_INDEX:
+ /* Do not discard the table on recovery. */
+ trx->table_id = 0;
+ /* fall through */
+ case TRX_DICT_OP_TABLE:
+ (*undo)->table_id = trx->table_id;
+ (*undo)->dict_operation = TRUE;
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+ + TRX_UNDO_DICT_TRANS, 1U);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+ + TRX_UNDO_TABLE_ID,
+ trx->table_id);
+ }
+
+ *err = DB_SUCCESS;
+ return block;
+}
+
+/*================ UNDO LOG ASSIGNMENT AND CLEANUP =====================*/
+
+/** Reuse a cached undo log block.
+@param[in,out] trx transaction
+@param[in,out] rseg rollback segment
+@param[out] pundo the undo log memory object
+@param[in,out] mtr mini-transaction
+@return the undo log block
+@retval NULL if none cached */
+static
+buf_block_t*
+trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo,
+ mtr_t* mtr)
+{
+ ut_ad(mutex_own(&rseg->mutex));
+
+ trx_undo_t* undo = UT_LIST_GET_FIRST(rseg->undo_cached);
+ if (!undo) {
+ return NULL;
+ }
+
+ ut_ad(undo->size == 1);
+ ut_ad(undo->id < TRX_RSEG_N_SLOTS);
+
+ buf_block_t* block = buf_page_get(page_id_t(undo->rseg->space->id,
+ undo->hdr_page_no),
+ 0, RW_X_LATCH, mtr);
+ if (!block) {
+ return NULL;
+ }
+
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ UT_LIST_REMOVE(rseg->undo_cached, undo);
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+
+ *pundo = undo;
+
+ uint16_t offset = trx_undo_header_create(block, trx->id, mtr);
+
+ trx_undo_mem_init_for_reuse(undo, trx->id, trx->xid, offset);
+
+ if (rseg != trx->rsegs.m_redo.rseg) {
+ return block;
+ }
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ return block;
+ case TRX_DICT_OP_INDEX:
+ /* Do not discard the table on recovery. */
+ trx->table_id = 0;
+ /* fall through */
+ case TRX_DICT_OP_TABLE:
+ undo->table_id = trx->table_id;
+ undo->dict_operation = TRUE;
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+ + TRX_UNDO_DICT_TRANS, 1U);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+ + TRX_UNDO_TABLE_ID,
+ trx->table_id);
+ }
+
+ return block;
+}
+
+/** Assign an undo log for a persistent transaction.
+A new undo log is created or a cached undo log reused.
+@param[in,out] trx transaction
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return the undo log block
+@retval NULL on error */
+buf_block_t*
+trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr)
+{
+ ut_ad(mtr->get_log_mode() == MTR_LOG_ALL);
+
+ trx_undo_t* undo = trx->rsegs.m_redo.undo;
+
+ if (undo) {
+ return buf_page_get_gen(
+ page_id_t(undo->rseg->space->id, undo->last_page_no),
+ 0, RW_X_LATCH, undo->guess_block,
+ BUF_GET, __FILE__, __LINE__, mtr, err);
+ }
+
+ trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
+
+ mutex_enter(&rseg->mutex);
+ buf_block_t* block = trx_undo_reuse_cached(
+ trx, rseg, &trx->rsegs.m_redo.undo, mtr);
+
+ if (!block) {
+ block = trx_undo_create(trx, rseg, &trx->rsegs.m_redo.undo,
+ err, mtr);
+ ut_ad(!block == (*err != DB_SUCCESS));
+ if (!block) {
+ goto func_exit;
+ }
+ } else {
+ *err = DB_SUCCESS;
+ }
+
+ UT_LIST_ADD_FIRST(rseg->undo_list, trx->rsegs.m_redo.undo);
+
+func_exit:
+ mutex_exit(&rseg->mutex);
+ return block;
+}
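+
+#if 0
+/* Minimal usage sketch (not compiled): obtaining an x-latched undo log page
+for a persistent transaction. The helper name is illustrative; the
+surrounding transaction setup and the writing of the undo record itself are
+assumed. */
+static void trx_undo_assign_example(trx_t *trx)
+{
+  mtr_t mtr;
+  dberr_t err= DB_SUCCESS;
+  mtr.start();
+  if (buf_block_t *undo_block= trx_undo_assign(trx, &err, &mtr))
+  {
+    /* undo_block is the last page of trx->rsegs.m_redo.undo, x-latched;
+    an undo log record could now be appended within this mini-transaction. */
+    ut_ad(err == DB_SUCCESS);
+  }
+  mtr.commit();
+}
+#endif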
+
+/** Assign an undo log for a transaction.
+A new undo log is created or a cached undo log reused.
+@param[in,out] trx transaction
+@param[in] rseg rollback segment
+@param[out] undo the undo log
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return the undo log block
+@retval NULL on error */
+buf_block_t*
+trx_undo_assign_low(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
+ dberr_t* err, mtr_t* mtr)
+{
+ const bool is_temp __attribute__((unused)) = rseg == trx->rsegs.m_noredo.rseg;
+
+ ut_ad(rseg == trx->rsegs.m_redo.rseg
+ || rseg == trx->rsegs.m_noredo.rseg);
+ ut_ad(undo == (is_temp
+ ? &trx->rsegs.m_noredo.undo
+ : &trx->rsegs.m_redo.undo));
+ ut_ad(mtr->get_log_mode()
+ == (is_temp ? MTR_LOG_NO_REDO : MTR_LOG_ALL));
+
+ if (*undo) {
+ return buf_page_get_gen(
+ page_id_t(rseg->space->id, (*undo)->last_page_no),
+ 0, RW_X_LATCH, (*undo)->guess_block,
+ BUF_GET, __FILE__, __LINE__, mtr, err);
+ }
+
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_too_many_trx",
+ *err = DB_TOO_MANY_CONCURRENT_TRXS; return NULL;
+ );
+
+ mutex_enter(&rseg->mutex);
+
+ buf_block_t* block = trx_undo_reuse_cached(trx, rseg, undo, mtr);
+
+ if (!block) {
+ block = trx_undo_create(trx, rseg, undo, err, mtr);
+ ut_ad(!block == (*err != DB_SUCCESS));
+ if (!block) {
+ goto func_exit;
+ }
+ } else {
+ *err = DB_SUCCESS;
+ }
+
+ UT_LIST_ADD_FIRST(rseg->undo_list, *undo);
+
+func_exit:
+ mutex_exit(&rseg->mutex);
+ return block;
+}
+
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction finish.
+@return undo log segment header page, x-latched */
+buf_block_t*
+trx_undo_set_state_at_finish(
+/*=========================*/
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_a(undo->id < TRX_RSEG_N_SLOTS);
+
+ buf_block_t* block = trx_undo_page_get(
+ page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr);
+
+ const uint16_t state = undo->size == 1
+ && TRX_UNDO_PAGE_REUSE_LIMIT
+ > mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + block->frame)
+ ? TRX_UNDO_CACHED
+ : TRX_UNDO_TO_PURGE;
+
+ undo->state = state;
+ mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+ + block->frame, state);
+ return block;
+}
+
+/** Set the state of the undo log segment at a XA PREPARE or XA ROLLBACK.
+@param[in,out] trx transaction
+@param[in,out] undo undo log
+@param[in] rollback false=XA PREPARE, true=XA ROLLBACK
+@param[in,out] mtr mini-transaction */
+void trx_undo_set_state_at_prepare(trx_t *trx, trx_undo_t *undo, bool rollback,
+ mtr_t *mtr)
+{
+ ut_a(undo->id < TRX_RSEG_N_SLOTS);
+
+ buf_block_t* block = trx_undo_page_get(
+ page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr);
+
+ if (rollback) {
+ ut_ad(undo->state == TRX_UNDO_PREPARED);
+ mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+ + block->frame, TRX_UNDO_ACTIVE);
+ return;
+ }
+
+ /*------------------------------*/
+ ut_ad(undo->state == TRX_UNDO_ACTIVE);
+ undo->state = TRX_UNDO_PREPARED;
+ undo->xid = *trx->xid;
+ /*------------------------------*/
+
+ mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + block->frame,
+ undo->state);
+ uint16_t offset = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
+ + block->frame);
+ mtr->write<1>(*block, block->frame + offset + TRX_UNDO_XID_EXISTS, 1U);
+
+ trx_undo_write_xid(block, offset, undo->xid, mtr);
+}
+
+/** Free temporary undo log after commit or rollback.
+The information is not needed after a commit or rollback, therefore
+the data can be discarded.
+@param undo temporary undo log */
+void trx_undo_commit_cleanup(trx_undo_t *undo)
+{
+ trx_rseg_t* rseg = undo->rseg;
+ ut_ad(rseg->space == fil_system.temp_space);
+
+ mutex_enter(&rseg->mutex);
+
+ UT_LIST_REMOVE(rseg->undo_list, undo);
+
+ if (undo->state == TRX_UNDO_CACHED) {
+ UT_LIST_ADD_FIRST(rseg->undo_cached, undo);
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ } else {
+ ut_ad(undo->state == TRX_UNDO_TO_PURGE);
+
+ /* First free the undo log segment in the file */
+ trx_undo_seg_free(undo);
+
+ ut_ad(rseg->curr_size > undo->size);
+ rseg->curr_size -= undo->size;
+
+ ut_free(undo);
+ }
+
+ mutex_exit(&rseg->mutex);
+}
+
+/** At shutdown, frees the undo logs of a transaction. */
+void trx_undo_free_at_shutdown(trx_t *trx)
+{
+ if (trx_undo_t*& undo = trx->rsegs.m_redo.undo) {
+ switch (undo->state) {
+ case TRX_UNDO_PREPARED:
+ break;
+ case TRX_UNDO_CACHED:
+ case TRX_UNDO_TO_PURGE:
+ ut_ad(trx_state_eq(trx,
+ TRX_STATE_COMMITTED_IN_MEMORY));
+ /* fall through */
+ case TRX_UNDO_ACTIVE:
+ /* trx_t::commit_state() assigns
+ trx->state = TRX_STATE_COMMITTED_IN_MEMORY. */
+ ut_a(!srv_was_started
+ || srv_read_only_mode
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
+ || srv_fast_shutdown);
+ break;
+ default:
+ ut_error;
+ }
+
+ UT_LIST_REMOVE(trx->rsegs.m_redo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo = NULL;
+ }
+ if (trx_undo_t*& undo = trx->rsegs.m_noredo.undo) {
+ ut_a(undo->state == TRX_UNDO_PREPARED);
+
+ UT_LIST_REMOVE(trx->rsegs.m_noredo.rseg->undo_list, undo);
+ ut_free(undo);
+ undo = NULL;
+ }
+}